diff options
Diffstat (limited to 'test')
182 files changed, 9291 insertions, 7612 deletions
diff --git a/test/Analysis/BasicAA/args-rets-allocas-loads.ll b/test/Analysis/BasicAA/args-rets-allocas-loads.ll index 05b56a07e44b..b31fb26f1c9b 100644 --- a/test/Analysis/BasicAA/args-rets-allocas-loads.ll +++ b/test/Analysis/BasicAA/args-rets-allocas-loads.ll @@ -308,4 +308,9 @@ define void @caller_a(double* %arg_a0, ; CHECK-NEXT: 0 mod responses (0.0%) ; CHECK-NEXT: 0 ref responses (0.0%) ; CHECK-NEXT: 140 mod & ref responses (76.0%) -; CHECK-NEXT: Alias Analysis Evaluator Mod/Ref Summary: 23%/0%/0%/76% +; CHECK-NEXT: 0 must responses (0.0%) +; CHECK-NEXT: 0 must mod responses (0.0%) +; CHECK-NEXT: 0 must ref responses (0.0%) +; CHECK-NEXT: 0 must mod & ref responses (0.0%) +; CHECK-NEXT: Alias Analysis Evaluator Mod/Ref Summary: 23%/0%/0%/76%/0%/0%/0%/0% + diff --git a/test/Analysis/BasicAA/call-attrs.ll b/test/Analysis/BasicAA/call-attrs.ll index 9cd17e486799..8538e8b4771d 100644 --- a/test/Analysis/BasicAA/call-attrs.ll +++ b/test/Analysis/BasicAA/call-attrs.ll @@ -31,12 +31,12 @@ entry: ret void } -; CHECK: Just Ref: Ptr: i8* %p <-> call void @readonly_attr(i8* %p) +; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> call void @readonly_attr(i8* %p) ; CHECK: Just Ref: Ptr: i8* %p <-> call void @readonly_func(i8* %p) -; CHECK: Just Mod: Ptr: i8* %p <-> call void @writeonly_attr(i8* %p) +; CHECK: Just Mod (MustAlias): Ptr: i8* %p <-> call void @writeonly_attr(i8* %p) ; CHECK: Just Mod: Ptr: i8* %p <-> call void @writeonly_func(i8* %p) ; CHECK: NoModRef: Ptr: i8* %p <-> call void @readnone_attr(i8* %p) ; CHECK: NoModRef: Ptr: i8* %p <-> call void @readnone_func(i8* %p) ; CHECK: Both ModRef: Ptr: i8* %p <-> call void @read_write(i8* %p, i8* %p, i8* %p) -; CHECK: Just Ref: Ptr: i8* %p <-> call void @func() [ "deopt"(i8* %p) ] +; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> call void @func() [ "deopt"(i8* %p) ] ; CHECK: Both ModRef: Ptr: i8* %p <-> call void @writeonly_attr(i8* %p) [ "deopt"(i8* %p) ] diff --git a/test/Analysis/BasicAA/cs-cs-arm.ll b/test/Analysis/BasicAA/cs-cs-arm.ll index 1580af9ea826..e4367bb6d61b 100644 --- a/test/Analysis/BasicAA/cs-cs-arm.ll +++ b/test/Analysis/BasicAA/cs-cs-arm.ll @@ -19,11 +19,11 @@ entry: ; CHECK-LABEL: Function: test1: ; CHECK: NoAlias: i8* %p, i8* %q -; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) ; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) ; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK: Just Ref (MustAlias): Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) ; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) ; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) ; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) diff --git a/test/Analysis/BasicAA/cs-cs.ll b/test/Analysis/BasicAA/cs-cs.ll index 3695275649b2..30b7e0720f0b 100644 --- a/test/Analysis/BasicAA/cs-cs.ll +++ b/test/Analysis/BasicAA/cs-cs.ll @@ -164,7 +164,7 @@ define void @test4(i8* %P, i8* noalias %Q) #3 { ; CHECK-LABEL: Function: test4: ; CHECK: NoAlias: i8* %P, i8* %Q -; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false) +; CHECK: Just Mod (MustAlias): Ptr: i8* %P <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false) ; CHECK: NoModRef: Ptr: i8* %Q <-> tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i32 1, i1 false) ; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) ; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) @@ -192,6 +192,26 @@ define void @test5(i8* %P, i8* %Q, i8* %R) #3 { ; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) } +define void @test5a(i8* noalias %P, i8* noalias %Q, i8* noalias %R) nounwind ssp { + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) + ret void + +; CHECK-LABEL: Function: test5a: + +; CHECK: NoAlias: i8* %P, i8* %Q +; CHECK: NoAlias: i8* %P, i8* %R +; CHECK: NoAlias: i8* %Q, i8* %R +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: NoModRef: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +; CHECK: Just Mod: Ptr: i8* %P <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) +; CHECK: NoModRef: Ptr: i8* %Q <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) +; CHECK: Just Ref: Ptr: i8* %R <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) +; CHECK: Just Mod: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i32 1, i1 false) <-> tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) +} + define void @test6(i8* %P) #3 { call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) call void @a_readonly_func(i8* %P) @@ -199,7 +219,7 @@ define void @test6(i8* %P) #3 { ; CHECK-LABEL: Function: test6: -; CHECK: Just Mod: Ptr: i8* %P <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) +; CHECK: Just Mod (MustAlias): Ptr: i8* %P <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) ; CHECK: Just Ref: Ptr: i8* %P <-> call void @a_readonly_func(i8* %P) ; CHECK: Just Mod: call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) <-> call void @a_readonly_func(i8* %P) ; CHECK: Just Ref: call void @a_readonly_func(i8* %P) <-> call void @llvm.memset.p0i8.i64(i8* %P, i8 -51, i64 32, i32 8, i1 false) @@ -237,9 +257,9 @@ entry: ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessiblememonly_func() ; CHECK: NoModRef: Ptr: i8* %q <-> call void @an_inaccessiblememonly_func() ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessibleorargmemonly_func(i8* %q) -; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q) +; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q) ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_argmemonly_func(i8* %q) -; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q) +; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q) ; CHECK: Just Ref: call void @a_readonly_func(i8* %p) <-> call void @an_inaccessiblememonly_func() ; CHECK: Just Ref: call void @a_readonly_func(i8* %p) <-> call void @an_inaccessibleorargmemonly_func(i8* %q) ; CHECK: Just Ref: call void @a_readonly_func(i8* %p) <-> call void @an_argmemonly_func(i8* %q) @@ -254,12 +274,34 @@ entry: ; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @a_readonly_func(i8* %p) ; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @a_writeonly_func(i8* %q) ; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @an_inaccessiblememonly_func() -; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @an_argmemonly_func(i8* %q) +; CHECK: Both ModRef (MustAlias): call void @an_inaccessibleorargmemonly_func(i8* %q) <-> call void @an_argmemonly_func(i8* %q) ; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) <-> call void @a_readonly_func(i8* %p) ; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) <-> call void @a_writeonly_func(i8* %q) ; CHECK: NoModRef: call void @an_argmemonly_func(i8* %q) <-> call void @an_inaccessiblememonly_func() -; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) <-> call void @an_inaccessibleorargmemonly_func(i8* %q) +; CHECK: Both ModRef (MustAlias): call void @an_argmemonly_func(i8* %q) <-> call void @an_inaccessibleorargmemonly_func(i8* %q) +} + +;; test that MustAlias is set for calls when no MayAlias is found. +declare void @another_argmemonly_func(i8*, i8*) #0 +define void @test8a(i8* noalias %p, i8* noalias %q) { +entry: + call void @another_argmemonly_func(i8* %p, i8* %q) + ret void + +; CHECK-LABEL: Function: test8a +; CHECK: Both ModRef: Ptr: i8* %p <-> call void @another_argmemonly_func(i8* %p, i8* %q) +; CHECK: Both ModRef: Ptr: i8* %q <-> call void @another_argmemonly_func(i8* %p, i8* %q) } +define void @test8b(i8* %p, i8* %q) { +entry: + call void @another_argmemonly_func(i8* %p, i8* %q) + ret void + +; CHECK-LABEL: Function: test8b +; CHECK: Both ModRef: Ptr: i8* %p <-> call void @another_argmemonly_func(i8* %p, i8* %q) +; CHECK: Both ModRef: Ptr: i8* %q <-> call void @another_argmemonly_func(i8* %p, i8* %q) +} + ;; test that unknown operand bundle has unknown effect to the heap define void @test9(i8* %p) { @@ -310,9 +352,9 @@ entry: ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] ; CHECK: NoModRef: Ptr: i8* %q <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] -; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] +; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] -; CHECK: Both ModRef: Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] +; CHECK: Both ModRef (MustAlias): Ptr: i8* %q <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] ; CHECK: Just Ref: call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] ; CHECK: Just Ref: call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] ; CHECK: Just Ref: call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] @@ -321,10 +363,10 @@ entry: ; CHECK: NoModRef: call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] ; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] ; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] -; CHECK: Both ModRef: call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] +; CHECK: Both ModRef (MustAlias): call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] <-> call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] ; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @a_readonly_func(i8* %p) #6 [ "unknown"() ] ; CHECK: NoModRef: call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @an_inaccessiblememonly_func() #7 [ "unknown"() ] -; CHECK: Both ModRef: call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] +; CHECK: Both ModRef (MustAlias): call void @an_argmemonly_func(i8* %q) #9 [ "unknown"() ] <-> call void @an_inaccessibleorargmemonly_func(i8* %q) #8 [ "unknown"() ] } attributes #0 = { argmemonly nounwind } diff --git a/test/Analysis/MemorySSA/volatile-clobber.ll b/test/Analysis/MemorySSA/volatile-clobber.ll index d6f960f3e382..53df7de499bd 100644 --- a/test/Analysis/MemorySSA/volatile-clobber.ll +++ b/test/Analysis/MemorySSA/volatile-clobber.ll @@ -22,8 +22,7 @@ define i32 @foo() { ret i32 %4 } -; Ensuring that we don't automatically hoist nonvolatile loads around volatile -; loads +; Ensuring we allow hoisting nonvolatile loads around volatile loads. ; CHECK-LABEL define void @volatile_only define void @volatile_only(i32* %arg1, i32* %arg2) { ; Trivially NoAlias/MustAlias @@ -36,7 +35,7 @@ define void @volatile_only(i32* %arg1, i32* %arg2) { ; CHECK: MemoryUse(liveOnEntry) ; CHECK-NEXT: load i32, i32* %b load i32, i32* %b -; CHECK: MemoryUse(1) +; CHECK: MemoryUse(liveOnEntry) ; CHECK-NEXT: load i32, i32* %a load i32, i32* %a @@ -44,7 +43,7 @@ define void @volatile_only(i32* %arg1, i32* %arg2) { ; CHECK: 2 = MemoryDef(1) ; CHECK-NEXT: load volatile i32, i32* %arg1 load volatile i32, i32* %arg1 -; CHECK: MemoryUse(2) +; CHECK: MemoryUse(liveOnEntry) ; CHECK-NEXT: load i32, i32* %arg2 load i32, i32* %arg2 @@ -75,10 +74,10 @@ define void @volatile_atomics(i32* %arg1, i32* %arg2) { ; CHECK: MemoryUse(1) ; CHECK-NEXT: load atomic i32, i32* %b unordered, align 4 load atomic i32, i32* %b unordered, align 4 -; CHECK: MemoryUse(2) +; CHECK: MemoryUse(1) ; CHECK-NEXT: load atomic i32, i32* %a unordered, align 4 load atomic i32, i32* %a unordered, align 4 -; CHECK: MemoryUse(2) +; CHECK: MemoryUse(1) ; CHECK-NEXT: load i32, i32* %a load i32, i32* %a @@ -86,7 +85,7 @@ define void @volatile_atomics(i32* %arg1, i32* %arg2) { ; CHECK: 3 = MemoryDef(2) ; CHECK-NEXT: load atomic volatile i32, i32* %arg1 monotonic, align 4 load atomic volatile i32, i32* %arg1 monotonic, align 4 -; CHECK: MemoryUse(3) +; CHECK: MemoryUse(1) ; CHECK-NEXT: load i32, i32* %arg2 load i32, i32* %arg2 diff --git a/test/Analysis/ValueTracking/memory-dereferenceable.ll b/test/Analysis/ValueTracking/memory-dereferenceable.ll index ca16a266123c..2e9453f670ce 100644 --- a/test/Analysis/ValueTracking/memory-dereferenceable.ll +++ b/test/Analysis/ValueTracking/memory-dereferenceable.ll @@ -20,7 +20,8 @@ declare i32* @foo() @globalptr.align16 = external global i8, align 16 ; CHECK-LABEL: 'test' -define void @test(i32 addrspace(1)* dereferenceable(8) %dparam, +define void @test(%struct.A* sret %result, + i32 addrspace(1)* dereferenceable(8) %dparam, i8 addrspace(1)* dereferenceable(32) align 1 %dparam.align1, i8 addrspace(1)* dereferenceable(32) align 16 %dparam.align16, i8* byval %i8_byval, @@ -41,17 +42,14 @@ entry: %empty_alloca = alloca i8, i64 0 %empty_load = load i8, i8* %empty_alloca - ; Load from too small array alloca -; CHECK-NOT: %small_array_alloca - %small_array_alloca = alloca i8, i64 2 - %saa_cast = bitcast i8* %small_array_alloca to i32* - %saa_load = load i32, i32* %saa_cast - - ; Load from array alloca -; CHECK: %big_array_alloca{{.*}}(unaligned) - %big_array_alloca = alloca i8, i64 4 - %baa_cast = bitcast i8* %big_array_alloca to i32* - %baa_load = load i32, i32* %baa_cast + ; Loads from sret arguments +; CHECK: %sret_gep{{.*}}(aligned) + %sret_gep = getelementptr inbounds %struct.A, %struct.A* %result, i64 0, i32 1, i64 2 + load i8, i8* %sret_gep + +; CHECK-NOT: %sret_gep_outside + %sret_gep_outside = getelementptr %struct.A, %struct.A* %result, i64 0, i32 1, i64 7 + load i8, i8* %sret_gep_outside ; CHECK: %dparam{{.*}}(aligned) %load3 = load i32, i32 addrspace(1)* %dparam diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll index 86ac5507a407..111aaf88b160 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -135,7 +135,7 @@ continue: } ; Check that we fallback on invoke translation failures. -; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %0:_(s128) = G_FCONSTANT quad 2 +; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %0:_(s128) = G_FCONSTANT fp128 0xL00000000000000004000000000000000 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_quad_dump ; FALLBACK-WITH-REPORT-OUT-LABEL: test_quad_dump: define fp128 @test_quad_dump() { diff --git a/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir b/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir new file mode 100644 index 000000000000..47fda8f998d7 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir @@ -0,0 +1,44 @@ +# RUN: llc -O0 -run-pass=legalizer -global-isel -global-isel-abort=0 %s -o - | FileCheck %s +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64" + + define fp128 @x(fp128 %a) { + entry: + %a.addr = alloca fp128, align 16 + store fp128 %a, fp128* %a.addr, align 16 + %0 = load fp128, fp128* %a.addr, align 16 + %sub = fsub fp128 0xL00000000000000008000000000000000, %0 + ret fp128 %sub + } + +... +--- +name: x +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +fixedStack: +stack: + - { id: 0, name: a.addr, type: default, offset: 0, size: 16, alignment: 16, + stack-id: 0, callee-saved-register: '', callee-saved-restored: true, + di-variable: '', di-expression: '', di-location: '' } +body: | + bb.1.entry: + liveins: %q0 + + ; This test just checks we don't crash on G_FNEG of FP128 types. Expect to fall + ; back until support is added for fp128. + ; CHECK: ret + %0:_(s128) = COPY %q0 + %1:_(p0) = G_FRAME_INDEX %stack.0.a.addr + G_STORE %0(s128), %1(p0) :: (store 16 into %ir.a.addr) + %2:_(s128) = G_LOAD %1(p0) :: (load 16 from %ir.a.addr) + %3:_(s128) = G_FNEG %2 + %q0 = COPY %3(s128) + RET_ReallyLR implicit %q0 + +... diff --git a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll index 865315bbe0a3..4b69575079a3 100644 --- a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll +++ b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll @@ -1,85 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -O0 -global-isel -stop-after=irtranslator -o - %s | FileCheck %s %type = type [4 x {i8, i32}] define %type* @first_offset_const(%type* %addr) { -; CHECK-LABEL: name: first_offset_const -; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0 -; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 -; CHECK: [[RES:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64) -; CHECK: %x0 = COPY [[RES]](p0) + ; CHECK-LABEL: name: first_offset_const + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: %x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) + ; CHECK: %x0 = COPY [[GEP]](p0) + ; CHECK: RET_ReallyLR implicit %x0 %res = getelementptr %type, %type* %addr, i32 1 ret %type* %res } define %type* @first_offset_trivial(%type* %addr) { -; CHECK-LABEL: name: first_offset_trivial -; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0 -; CHECK: [[TRIVIAL:%[0-9]+]]:_(p0) = COPY [[BASE]](p0) -; CHECK: %x0 = COPY [[TRIVIAL]](p0) + ; CHECK-LABEL: name: first_offset_trivial + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: %x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY [[COPY]](p0) + ; CHECK: %x0 = COPY [[COPY1]](p0) + ; CHECK: RET_ReallyLR implicit %x0 %res = getelementptr %type, %type* %addr, i32 0 ret %type* %res } define %type* @first_offset_variable(%type* %addr, i64 %idx) { -; CHECK-LABEL: name: first_offset_variable -; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0 -; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1 -; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 -; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]] -; CHECK: [[STEP0:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64) -; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[STEP0]](p0) -; CHECK: %x0 = COPY [[RES]](p0) + ; CHECK-LABEL: name: first_offset_variable + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: %x0, %x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[COPY1]] + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64) + ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP]](p0) + ; CHECK: %x0 = COPY [[COPY2]](p0) + ; CHECK: RET_ReallyLR implicit %x0 %res = getelementptr %type, %type* %addr, i64 %idx ret %type* %res } define %type* @first_offset_ext(%type* %addr, i32 %idx) { -; CHECK-LABEL: name: first_offset_ext -; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0 -; CHECK: [[IDX32:%[0-9]+]]:_(s32) = COPY %w1 -; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 -; CHECK: [[IDX64:%[0-9]+]]:_(s64) = G_SEXT [[IDX32]](s32) -; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX64]] -; CHECK: [[STEP0:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64) -; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[STEP0]](p0) -; CHECK: %x0 = COPY [[RES]](p0) + ; CHECK-LABEL: name: first_offset_ext + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: %w1, %x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %w1 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32) + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[SEXT]] + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64) + ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP]](p0) + ; CHECK: %x0 = COPY [[COPY2]](p0) + ; CHECK: RET_ReallyLR implicit %x0 %res = getelementptr %type, %type* %addr, i32 %idx ret %type* %res } %type1 = type [4 x [4 x i32]] define i32* @const_then_var(%type1* %addr, i64 %idx) { -; CHECK-LABEL: name: const_then_var -; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0 -; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1 -; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_CONSTANT i64 272 -; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 -; CHECK: [[BASE1:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET1]](s64) -; CHECK: [[OFFSET2:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]] -; CHECK: [[BASE2:%[0-9]+]]:_(p0) = G_GEP [[BASE1]], [[OFFSET2]](s64) -; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[BASE2]](p0) -; CHECK: %x0 = COPY [[RES]](p0) + ; CHECK-LABEL: name: const_then_var + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: %x0, %x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 272 + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C1]], [[COPY1]] + ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[GEP]], [[MUL]](s64) + ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP1]](p0) + ; CHECK: %x0 = COPY [[COPY2]](p0) + ; CHECK: RET_ReallyLR implicit %x0 %res = getelementptr %type1, %type1* %addr, i32 4, i32 1, i64 %idx ret i32* %res } define i32* @var_then_const(%type1* %addr, i64 %idx) { -; CHECK-LABEL: name: var_then_const -; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0 -; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1 -; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 -; CHECK: [[OFFSET2:%[0-9]+]]:_(s64) = G_CONSTANT i64 40 -; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]] -; CHECK: [[BASE1:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET1]](s64) -; CHECK: [[BASE2:%[0-9]+]]:_(p0) = G_GEP [[BASE1]], [[OFFSET2]](s64) -; CHECK: %x0 = COPY [[BASE2]](p0) + ; CHECK-LABEL: name: var_then_const + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: %x0, %x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %x1 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 40 + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[COPY1]] + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64) + ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[GEP]], [[C1]](s64) + ; CHECK: %x0 = COPY [[GEP1]](p0) + ; CHECK: RET_ReallyLR implicit %x0 %res = getelementptr %type1, %type1* %addr, i64 %idx, i32 2, i32 2 ret i32* %res } diff --git a/test/CodeGen/AArch64/arm64-jumptable.ll b/test/CodeGen/AArch64/arm64-jumptable.ll index f5c2ee6da0bf..fac3e5704d15 100644 --- a/test/CodeGen/AArch64/arm64-jumptable.ll +++ b/test/CodeGen/AArch64/arm64-jumptable.ll @@ -6,22 +6,20 @@ define void @sum(i32 %a, i32* %to, i32 %c) { entry: switch i32 %a, label %exit [ i32 1, label %bb1 - i32 2, label %bb2 + i32 2, label %exit.sink.split i32 3, label %bb3 i32 4, label %bb4 ] bb1: %b = add i32 %c, 1 - store i32 %b, i32* %to - br label %exit -bb2: - store i32 2, i32* %to - br label %exit + br label %exit.sink.split bb3: - store i32 3, i32* %to - br label %exit + br label %exit.sink.split bb4: - store i32 5, i32* %to + br label %exit.sink.split +exit.sink.split: + %.sink = phi i32 [ 5, %bb4 ], [ %b, %bb1 ], [ 3, %bb3 ], [ %a, %entry ] + store i32 %.sink, i32* %to br label %exit exit: ret void diff --git a/test/CodeGen/AArch64/arm64-memset-to-bzero.ll b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll index 29036caabf3a..3466e1bace56 100644 --- a/test/CodeGen/AArch64/arm64-memset-to-bzero.ll +++ b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll @@ -4,9 +4,10 @@ ; RUN: FileCheck --check-prefix=CHECK-LINUX --check-prefix=CHECK %s ; <rdar://problem/14199482> ARM64: Calls to bzero() replaced with calls to memset() -; CHECK: @fct1 +; CHECK-LABEL: fct1: ; For small size (<= 256), we do not change memset to bzero. -; CHECK: memset +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset define void @fct1(i8* nocapture %ptr) { entry: tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i32 1, i1 false) @@ -15,20 +16,20 @@ entry: declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) -; CHECK: @fct2 +; CHECK-LABEL: fct2: ; When the size is bigger than 256, change into bzero. -; CHECK-DARWIN: bzero -; CHECK-LINUX: memset +; CHECK-DARWIN: {{b|bl}} _bzero +; CHECK-LINUX: {{b|bl}} memset define void @fct2(i8* nocapture %ptr) { entry: tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i32 1, i1 false) ret void } -; CHECK: @fct3 +; CHECK-LABEL: fct3: ; For unknown size, change to bzero. -; CHECK-DARWIN: bzero -; CHECK-LINUX: memset +; CHECK-DARWIN: {{b|bl}} _bzero +; CHECK-LINUX: {{b|bl}} memset define void @fct3(i8* nocapture %ptr, i32 %unknown) { entry: %conv = sext i32 %unknown to i64 @@ -36,9 +37,10 @@ entry: ret void } -; CHECK: @fct4 +; CHECK-LABEL: fct4: ; Size <= 256, no change. -; CHECK: memset +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset define void @fct4(i8* %ptr) { entry: %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) @@ -50,10 +52,10 @@ declare i8* @__memset_chk(i8*, i32, i64, i64) declare i64 @llvm.objectsize.i64(i8*, i1) -; CHECK: @fct5 +; CHECK-LABEL: fct5: ; Size > 256, change. -; CHECK-DARWIN: bzero -; CHECK-LINUX: memset +; CHECK-DARWIN: {{b|bl}} _bzero +; CHECK-LINUX: {{b|bl}} memset define void @fct5(i8* %ptr) { entry: %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) @@ -61,10 +63,10 @@ entry: ret void } -; CHECK: @fct6 +; CHECK-LABEL: fct6: ; Size = unknown, change. -; CHECK-DARWIN: bzero -; CHECK-LINUX: memset +; CHECK-DARWIN: {{b|bl}} _bzero +; CHECK-LINUX: {{b|bl}} memset define void @fct6(i8* %ptr, i32 %unknown) { entry: %conv = sext i32 %unknown to i64 @@ -76,9 +78,10 @@ entry: ; Next functions check that memset is not turned into bzero ; when the set constant is non-zero, whatever the given size. -; CHECK: @fct7 +; CHECK-LABEL: fct7: ; memset with something that is not a zero, no change. -; CHECK: memset +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset define void @fct7(i8* %ptr) { entry: %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) @@ -86,9 +89,10 @@ entry: ret void } -; CHECK: @fct8 +; CHECK-LABEL: fct8: ; memset with something that is not a zero, no change. -; CHECK: memset +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset define void @fct8(i8* %ptr) { entry: %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) @@ -96,9 +100,10 @@ entry: ret void } -; CHECK: @fct9 +; CHECK-LABEL: fct9: ; memset with something that is not a zero, no change. -; CHECK: memset +; CHECK-DARWIN: {{b|bl}} _memset +; CHECK-LINUX: {{b|bl}} memset define void @fct9(i8* %ptr, i32 %unknown) { entry: %conv = sext i32 %unknown to i64 diff --git a/test/CodeGen/AArch64/arm64-neon-2velem.ll b/test/CodeGen/AArch64/arm64-neon-2velem.ll index d22bfc76d1d5..b3a2bcd5d669 100644 --- a/test/CodeGen/AArch64/arm64-neon-2velem.ll +++ b/test/CodeGen/AArch64/arm64-neon-2velem.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck --check-prefix=EXYNOS %s +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,GENERIC ; The instruction latencies of Exynos-M1 trigger the transform we see under the Exynos check. +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck %s --check-prefixes=CHECK,EXYNOSM1 declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) @@ -47,7 +47,6 @@ declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmla_lane_s16: ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %mul = mul <4 x i16> %shuffle, %b @@ -58,7 +57,6 @@ entry: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlaq_lane_s16: ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> %mul = mul <8 x i16> %shuffle, %b @@ -69,7 +67,6 @@ entry: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmla_lane_s32: ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %mul = mul <2 x i32> %shuffle, %b @@ -80,7 +77,6 @@ entry: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlaq_lane_s32: ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %mul = mul <4 x i32> %shuffle, %b @@ -91,7 +87,6 @@ entry: define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmla_laneq_s16: ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> %mul = mul <4 x i16> %shuffle, %b @@ -102,7 +97,6 @@ entry: define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlaq_laneq_s16: ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> %mul = mul <8 x i16> %shuffle, %b @@ -113,7 +107,6 @@ entry: define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmla_laneq_s32: ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> %mul = mul <2 x i32> %shuffle, %b @@ -124,7 +117,6 @@ entry: define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlaq_laneq_s32: ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %mul = mul <4 x i32> %shuffle, %b @@ -135,7 +127,6 @@ entry: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmls_lane_s16: ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %mul = mul <4 x i16> %shuffle, %b @@ -146,7 +137,6 @@ entry: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsq_lane_s16: ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> %mul = mul <8 x i16> %shuffle, %b @@ -157,7 +147,6 @@ entry: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmls_lane_s32: ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %mul = mul <2 x i32> %shuffle, %b @@ -168,7 +157,6 @@ entry: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsq_lane_s32: ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %mul = mul <4 x i32> %shuffle, %b @@ -179,7 +167,6 @@ entry: define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmls_laneq_s16: ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> %mul = mul <4 x i16> %shuffle, %b @@ -190,7 +177,6 @@ entry: define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsq_laneq_s16: ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> %mul = mul <8 x i16> %shuffle, %b @@ -201,7 +187,6 @@ entry: define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmls_laneq_s32: ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> %mul = mul <2 x i32> %shuffle, %b @@ -212,7 +197,6 @@ entry: define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsq_laneq_s32: ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %mul = mul <4 x i32> %shuffle, %b @@ -223,7 +207,6 @@ entry: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmul_lane_s16: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %mul = mul <4 x i16> %shuffle, %a @@ -233,7 +216,6 @@ entry: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmulq_lane_s16: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> %mul = mul <8 x i16> %shuffle, %a @@ -243,7 +225,6 @@ entry: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmul_lane_s32: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %mul = mul <2 x i32> %shuffle, %a @@ -253,7 +234,6 @@ entry: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmulq_lane_s32: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %mul = mul <4 x i32> %shuffle, %a @@ -263,7 +243,6 @@ entry: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmul_lane_u16: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %mul = mul <4 x i16> %shuffle, %a @@ -273,7 +252,6 @@ entry: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmulq_lane_u16: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> %mul = mul <8 x i16> %shuffle, %a @@ -283,7 +261,6 @@ entry: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmul_lane_u32: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %mul = mul <2 x i32> %shuffle, %a @@ -293,7 +270,6 @@ entry: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmulq_lane_u32: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %mul = mul <4 x i32> %shuffle, %a @@ -303,7 +279,6 @@ entry: define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmul_laneq_s16: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> %mul = mul <4 x i16> %shuffle, %a @@ -313,7 +288,6 @@ entry: define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmulq_laneq_s16: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> %mul = mul <8 x i16> %shuffle, %a @@ -323,7 +297,6 @@ entry: define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmul_laneq_s32: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> %mul = mul <2 x i32> %shuffle, %a @@ -333,7 +306,6 @@ entry: define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmulq_laneq_s32: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %mul = mul <4 x i32> %shuffle, %a @@ -343,7 +315,6 @@ entry: define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmul_laneq_u16: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> %mul = mul <4 x i16> %shuffle, %a @@ -353,7 +324,6 @@ entry: define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmulq_laneq_u16: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> %mul = mul <8 x i16> %shuffle, %a @@ -363,7 +333,6 @@ entry: define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmul_laneq_u32: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> %mul = mul <2 x i32> %shuffle, %a @@ -373,7 +342,6 @@ entry: define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmulq_laneq_u32: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %mul = mul <4 x i32> %shuffle, %a @@ -382,12 +350,9 @@ entry: define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfma_lane_f32: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfma_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] -; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -398,12 +363,9 @@ declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfmaq_lane_f32: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmaq_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -414,12 +376,9 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfma_laneq_f32: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfma_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3] -; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -428,12 +387,9 @@ entry: define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmaq_laneq_f32: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmaq_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -442,12 +398,9 @@ entry: define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfms_lane_f32: -; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfms_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] -; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1> @@ -457,12 +410,9 @@ entry: define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfmsq_lane_f32: -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1] -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> @@ -472,12 +422,9 @@ entry: define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfms_laneq_f32: -; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfms_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3] -; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3> @@ -487,12 +434,9 @@ entry: define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmsq_laneq_f32: -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> @@ -502,12 +446,9 @@ entry: define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { ; CHECK-LABEL: test_vfmaq_lane_f64: -; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmaq_lane_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -518,12 +459,9 @@ declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; CHECK-LABEL: test_vfmaq_laneq_f64: -; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmaq_laneq_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1] -; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1> %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -532,12 +470,9 @@ entry: define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) { ; CHECK-LABEL: test_vfmsq_lane_f64: -; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_lane_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %sub = fsub <1 x double> <double -0.000000e+00>, %v %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer @@ -547,12 +482,9 @@ entry: define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; CHECK-LABEL: test_vfmsq_laneq_f64: -; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_laneq_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1] -; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1> @@ -563,10 +495,6 @@ entry: define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmas_laneq_f32 ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXNOS-LABEL: test_vfmas_laneq_f32 -; EXNOS: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] -; EXNOS-NEXT: ret entry: %extract = extractelement <4 x float> %v, i32 3 %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a) @@ -578,7 +506,6 @@ declare float @llvm.fma.f32(float, float, float) define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) { ; CHECK-LABEL: test_vfmsd_lane_f64 ; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} -; CHECK-NEXT: ret entry: %extract.rhs = extractelement <1 x double> %v, i32 0 %extract = fsub double -0.000000e+00, %extract.rhs @@ -591,10 +518,6 @@ declare double @llvm.fma.f64(double, double, double) define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) { ; CHECK-LABEL: test_vfmss_lane_f32 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmss_lane_f32 -; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] -; EXYNOS-NEXT: ret entry: %extract.rhs = extractelement <2 x float> %v, i32 1 %extract = fsub float -0.000000e+00, %extract.rhs @@ -605,7 +528,6 @@ entry: define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmss_laneq_f32 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %extract.rhs = extractelement <4 x float> %v, i32 3 %extract = fsub float -0.000000e+00, %extract.rhs @@ -616,10 +538,6 @@ entry: define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) { ; CHECK-LABEL: test_vfmsd_laneq_f64 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsd_laneq_f64 -; EXYNOS: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] -; EXYNOS-NEXT: ret entry: %extract.rhs = extractelement <2 x double> %v, i32 1 %extract = fsub double -0.000000e+00, %extract.rhs @@ -641,10 +559,6 @@ entry: define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) { ; CHECK-LABEL: test_vfmss_lane_f32_0 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmss_lane_f32_0 -; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1] -; EXYNOS-NEXT: ret entry: %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v %tmp1 = extractelement <2 x float> %tmp0, i32 1 @@ -655,7 +569,6 @@ entry: define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmss_laneq_f32_0 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %tmp0 = fsub <4 x float><float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v %tmp1 = extractelement <4 x float> %tmp0, i32 3 @@ -666,7 +579,6 @@ entry: define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) { ; CHECK-LABEL: test_vfmsd_laneq_f64_0 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret entry: %tmp0 = fsub <2 x double><double -0.000000e+00, double -0.000000e+00>, %v %tmp1 = extractelement <2 x double> %tmp0, i32 1 @@ -677,7 +589,6 @@ entry: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_lane_s16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -688,7 +599,6 @@ entry: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_lane_s32: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -699,7 +609,6 @@ entry: define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_laneq_s16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -710,7 +619,6 @@ entry: define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_laneq_s32: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -721,7 +629,6 @@ entry: define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_lane_s16: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> @@ -733,7 +640,6 @@ entry: define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_lane_s32: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> @@ -745,7 +651,6 @@ entry: define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_s16: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> @@ -757,7 +662,6 @@ entry: define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_s32: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> @@ -769,7 +673,6 @@ entry: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_lane_s16: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -780,7 +683,6 @@ entry: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_lane_s32: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -791,7 +693,6 @@ entry: define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_laneq_s16: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -802,7 +703,6 @@ entry: define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_laneq_s32: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -813,7 +713,6 @@ entry: define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_s16: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> @@ -825,7 +724,6 @@ entry: define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_s32: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> @@ -837,7 +735,6 @@ entry: define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_s16: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> @@ -849,7 +746,6 @@ entry: define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_s32: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> @@ -861,7 +757,6 @@ entry: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_lane_u16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -872,7 +767,6 @@ entry: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_lane_u32: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -883,7 +777,6 @@ entry: define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_laneq_u16: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -894,7 +787,6 @@ entry: define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_laneq_u32: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -905,7 +797,6 @@ entry: define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_lane_u16: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> @@ -917,7 +808,6 @@ entry: define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_lane_u32: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> @@ -929,7 +819,6 @@ entry: define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_u16: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> @@ -941,7 +830,6 @@ entry: define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_u32: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> @@ -953,7 +841,6 @@ entry: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_lane_u16: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -964,7 +851,6 @@ entry: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_lane_u32: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -975,7 +861,6 @@ entry: define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_laneq_u16: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -986,7 +871,6 @@ entry: define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_laneq_u32: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -997,7 +881,6 @@ entry: define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_u16: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> @@ -1009,7 +892,6 @@ entry: define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_u32: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> @@ -1021,7 +903,6 @@ entry: define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_u16: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> @@ -1033,7 +914,6 @@ entry: define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_u32: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> @@ -1045,7 +925,6 @@ entry: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_lane_s16: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1055,7 +934,6 @@ entry: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_lane_s32: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1065,7 +943,6 @@ entry: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_lane_u16: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1075,7 +952,6 @@ entry: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_lane_u32: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1085,7 +961,6 @@ entry: define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_high_lane_s16: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> @@ -1096,7 +971,6 @@ entry: define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_high_lane_s32: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> @@ -1107,7 +981,6 @@ entry: define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_high_lane_u16: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> @@ -1118,7 +991,6 @@ entry: define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_high_lane_u32: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> @@ -1129,7 +1001,6 @@ entry: define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_laneq_s16: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1139,7 +1010,6 @@ entry: define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_laneq_s32: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1149,7 +1019,6 @@ entry: define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_laneq_u16: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1159,7 +1028,6 @@ entry: define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_laneq_u32: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1169,7 +1037,6 @@ entry: define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_high_laneq_s16: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> @@ -1180,7 +1047,6 @@ entry: define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_high_laneq_s32: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> @@ -1191,7 +1057,6 @@ entry: define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_high_laneq_u16: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> @@ -1202,7 +1067,6 @@ entry: define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_high_laneq_u32: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> @@ -1213,7 +1077,6 @@ entry: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlal_lane_s16: ; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1224,7 +1087,6 @@ entry: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlal_lane_s32: ; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1235,7 +1097,6 @@ entry: define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlal_high_lane_s16: ; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> @@ -1247,7 +1108,6 @@ entry: define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlal_high_lane_s32: ; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> @@ -1259,7 +1119,6 @@ entry: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlsl_lane_s16: ; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1270,7 +1129,6 @@ entry: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlsl_lane_s32: ; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1281,7 +1139,6 @@ entry: define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlsl_high_lane_s16: ; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> @@ -1293,7 +1150,6 @@ entry: define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlsl_high_lane_s32: ; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> @@ -1305,7 +1161,6 @@ entry: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmull_lane_s16: ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1315,7 +1170,6 @@ entry: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmull_lane_s32: ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1325,7 +1179,6 @@ entry: define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vqdmull_laneq_s16: ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1335,7 +1188,6 @@ entry: define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vqdmull_laneq_s32: ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1345,7 +1197,6 @@ entry: define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmull_high_lane_s16: ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> @@ -1356,7 +1207,6 @@ entry: define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmull_high_lane_s32: ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> @@ -1367,7 +1217,6 @@ entry: define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vqdmull_high_laneq_s16: ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7> @@ -1378,7 +1227,6 @@ entry: define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vqdmull_high_laneq_s32: ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3> @@ -1389,7 +1237,6 @@ entry: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmulh_lane_s16: ; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -1399,7 +1246,6 @@ entry: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmulhq_lane_s16: ; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -1409,7 +1255,6 @@ entry: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmulh_lane_s32: ; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -1419,7 +1264,6 @@ entry: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmulhq_lane_s32: ; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -1429,7 +1273,6 @@ entry: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqrdmulh_lane_s16: ; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -1439,7 +1282,6 @@ entry: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqrdmulhq_lane_s16: ; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -1449,7 +1291,6 @@ entry: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqrdmulh_lane_s32: ; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1> %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -1459,7 +1300,6 @@ entry: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqrdmulhq_lane_s32: ; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -1468,12 +1308,9 @@ entry: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmul_lane_f32: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] -; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1> %mul = fmul <2 x float> %shuffle, %a @@ -1483,10 +1320,6 @@ entry: define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) { ; CHECK-LABEL: test_vmul_lane_f64: ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_lane_f64: -; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} -; EXYNOS-NEXT: ret entry: %0 = bitcast <1 x double> %a to <8 x i8> %1 = bitcast <8 x i8> %0 to double @@ -1498,12 +1331,9 @@ entry: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmulq_lane_f32: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1] -; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %mul = fmul <4 x float> %shuffle, %a @@ -1512,12 +1342,9 @@ entry: define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) { ; CHECK-LABEL: test_vmulq_lane_f64: -; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_lane_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %mul = fmul <2 x double> %shuffle, %a @@ -1526,12 +1353,9 @@ entry: define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmul_laneq_f32: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3] -; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3> %mul = fmul <2 x float> %shuffle, %a @@ -1541,10 +1365,6 @@ entry: define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) { ; CHECK-LABEL: test_vmul_laneq_f64: ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_laneq_f64: -; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1] -; EXYNOS-NEXT: ret entry: %0 = bitcast <1 x double> %a to <8 x i8> %1 = bitcast <8 x i8> %0 to double @@ -1556,12 +1376,9 @@ entry: define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmulq_laneq_f32: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %mul = fmul <4 x float> %shuffle, %a @@ -1570,12 +1387,9 @@ entry: define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) { ; CHECK-LABEL: test_vmulq_laneq_f64: -; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_laneq_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1] -; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1> %mul = fmul <2 x double> %shuffle, %a @@ -1584,12 +1398,9 @@ entry: define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmulx_lane_f32: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulx_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] -; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1> %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -1598,12 +1409,9 @@ entry: define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmulxq_lane_f32: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_lane_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1] -; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; Exynos-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -1612,12 +1420,9 @@ entry: define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) { ; CHECK-LABEL: test_vmulxq_lane_f64: -; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_lane_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -1626,12 +1431,9 @@ entry: define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmulx_laneq_f32: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulx_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3] -; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3] +; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3> %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -1640,12 +1442,9 @@ entry: define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmulxq_laneq_f32: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_laneq_f32: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -1654,12 +1453,9 @@ entry: define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) { ; CHECK-LABEL: test_vmulxq_laneq_f64: -; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_laneq_f64: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1] -; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1] +; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1> %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -1669,7 +1465,6 @@ entry: define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmla_lane_s16_0: ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -1680,7 +1475,6 @@ entry: define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlaq_lane_s16_0: ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %b @@ -1691,7 +1485,6 @@ entry: define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmla_lane_s32_0: ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -1702,7 +1495,6 @@ entry: define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlaq_lane_s32_0: ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %b @@ -1713,7 +1505,6 @@ entry: define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmla_laneq_s16_0: ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -1724,7 +1515,6 @@ entry: define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlaq_laneq_s16_0: ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %b @@ -1735,7 +1525,6 @@ entry: define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmla_laneq_s32_0: ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -1746,7 +1535,6 @@ entry: define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlaq_laneq_s32_0: ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %b @@ -1757,7 +1545,6 @@ entry: define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmls_lane_s16_0: ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -1768,7 +1555,6 @@ entry: define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsq_lane_s16_0: ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %b @@ -1779,7 +1565,6 @@ entry: define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmls_lane_s32_0: ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -1790,7 +1575,6 @@ entry: define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsq_lane_s32_0: ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %b @@ -1801,7 +1585,6 @@ entry: define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmls_laneq_s16_0: ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -1812,7 +1595,6 @@ entry: define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsq_laneq_s16_0: ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %b @@ -1823,7 +1605,6 @@ entry: define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmls_laneq_s32_0: ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -1834,7 +1615,6 @@ entry: define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsq_laneq_s32_0: ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %b @@ -1845,7 +1625,6 @@ entry: define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmul_lane_s16_0: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -1855,7 +1634,6 @@ entry: define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmulq_lane_s16_0: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %a @@ -1865,7 +1643,6 @@ entry: define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmul_lane_s32_0: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -1875,7 +1652,6 @@ entry: define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmulq_lane_s32_0: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %a @@ -1885,7 +1661,6 @@ entry: define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmul_lane_u16_0: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -1895,7 +1670,6 @@ entry: define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmulq_lane_u16_0: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %a @@ -1905,7 +1679,6 @@ entry: define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmul_lane_u32_0: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -1915,7 +1688,6 @@ entry: define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmulq_lane_u32_0: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %a @@ -1925,7 +1697,6 @@ entry: define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmul_laneq_s16_0: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -1935,7 +1706,6 @@ entry: define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmulq_laneq_s16_0: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %a @@ -1945,7 +1715,6 @@ entry: define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmul_laneq_s32_0: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -1955,7 +1724,6 @@ entry: define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmulq_laneq_s32_0: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %a @@ -1965,7 +1733,6 @@ entry: define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmul_laneq_u16_0: ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -1975,7 +1742,6 @@ entry: define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmulq_laneq_u16_0: ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer %mul = mul <8 x i16> %shuffle, %a @@ -1985,7 +1751,6 @@ entry: define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmul_laneq_u32_0: ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -1995,7 +1760,6 @@ entry: define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmulq_laneq_u32_0: ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer %mul = mul <4 x i32> %shuffle, %a @@ -2004,12 +1768,9 @@ entry: define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfma_lane_f32_0: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfma_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -2018,12 +1779,9 @@ entry: define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfmaq_lane_f32_0: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmaq_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -2032,12 +1790,9 @@ entry: define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfma_laneq_f32_0: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfma_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -2046,12 +1801,9 @@ entry: define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmaq_laneq_f32_0: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmaq_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a) @@ -2060,12 +1812,9 @@ entry: define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfms_lane_f32_0: -; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfms_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer @@ -2075,12 +1824,9 @@ entry: define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) { ; CHECK-LABEL: test_vfmsq_lane_f32_0: -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer @@ -2090,12 +1836,9 @@ entry: define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfms_laneq_f32_0: -; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfms_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer @@ -2105,12 +1848,9 @@ entry: define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) { ; CHECK-LABEL: test_vfmsq_laneq_f32_0: -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer @@ -2120,12 +1860,9 @@ entry: define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; CHECK-LABEL: test_vfmaq_laneq_f64_0: -; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmaq_laneq_f64_0: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a) @@ -2134,12 +1871,9 @@ entry: define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) { ; CHECK-LABEL: test_vfmsq_laneq_f64_0: -; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vfmsq_laneq_f64_0: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer @@ -2150,7 +1884,6 @@ entry: define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_lane_s16_0: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2161,7 +1894,6 @@ entry: define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_lane_s32_0: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2172,7 +1904,6 @@ entry: define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_laneq_s16_0: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2183,7 +1914,6 @@ entry: define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_laneq_s32_0: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2194,7 +1924,6 @@ entry: define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_lane_s16_0: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2206,7 +1935,6 @@ entry: define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_lane_s32_0: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2218,7 +1946,6 @@ entry: define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_s16_0: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2230,7 +1957,6 @@ entry: define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_s32_0: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2242,7 +1968,6 @@ entry: define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_lane_s16_0: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2253,7 +1978,6 @@ entry: define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_lane_s32_0: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2264,7 +1988,6 @@ entry: define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_laneq_s16_0: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2275,7 +1998,6 @@ entry: define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_laneq_s32_0: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2286,7 +2008,6 @@ entry: define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_s16_0: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2298,7 +2019,6 @@ entry: define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_s32_0: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2310,7 +2030,6 @@ entry: define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_s16_0: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2322,7 +2041,6 @@ entry: define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_s32_0: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2334,7 +2052,6 @@ entry: define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_lane_u16_0: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2345,7 +2062,6 @@ entry: define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_lane_u32_0: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2356,7 +2072,6 @@ entry: define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_laneq_u16_0: ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2367,7 +2082,6 @@ entry: define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_laneq_u32_0: ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2378,7 +2092,6 @@ entry: define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_lane_u16_0: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2390,7 +2103,6 @@ entry: define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_lane_u32_0: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2402,7 +2114,6 @@ entry: define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_u16_0: ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2414,7 +2125,6 @@ entry: define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlal_high_laneq_u32_0: ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2426,7 +2136,6 @@ entry: define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_lane_u16_0: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2437,7 +2146,6 @@ entry: define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_lane_u32_0: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2448,7 +2156,6 @@ entry: define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_laneq_u16_0: ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2459,7 +2166,6 @@ entry: define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_laneq_u32_0: ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2470,7 +2176,6 @@ entry: define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_u16_0: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2482,7 +2187,6 @@ entry: define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_lane_u32_0: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2494,7 +2198,6 @@ entry: define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_u16_0: ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2506,7 +2209,6 @@ entry: define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) { ; CHECK-LABEL: test_vmlsl_high_laneq_u32_0: ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2518,7 +2220,6 @@ entry: define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_lane_s16_0: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2528,7 +2229,6 @@ entry: define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_lane_s32_0: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2538,7 +2238,6 @@ entry: define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_lane_u16_0: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2548,7 +2247,6 @@ entry: define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_lane_u32_0: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2558,7 +2256,6 @@ entry: define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_high_lane_s16_0: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2569,7 +2266,6 @@ entry: define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_high_lane_s32_0: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2580,7 +2276,6 @@ entry: define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vmull_high_lane_u16_0: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2591,7 +2286,6 @@ entry: define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vmull_high_lane_u32_0: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2602,7 +2296,6 @@ entry: define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_laneq_s16_0: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2612,7 +2305,6 @@ entry: define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_laneq_s32_0: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2622,7 +2314,6 @@ entry: define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_laneq_u16_0: ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2632,7 +2323,6 @@ entry: define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_laneq_u32_0: ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2642,7 +2332,6 @@ entry: define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_high_laneq_s16_0: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2653,7 +2342,6 @@ entry: define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_high_laneq_s32_0: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2664,7 +2352,6 @@ entry: define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vmull_high_laneq_u16_0: ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2675,7 +2362,6 @@ entry: define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vmull_high_laneq_u32_0: ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2686,7 +2372,6 @@ entry: define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlal_lane_s16_0: ; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2697,7 +2382,6 @@ entry: define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlal_lane_s32_0: ; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2708,7 +2392,6 @@ entry: define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlal_high_lane_s16_0: ; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2720,7 +2403,6 @@ entry: define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlal_high_lane_s32_0: ; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2732,7 +2414,6 @@ entry: define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlsl_lane_s16_0: ; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -2743,7 +2424,6 @@ entry: define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlsl_lane_s32_0: ; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -2754,7 +2434,6 @@ entry: define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmlsl_high_lane_s16_0: ; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2766,7 +2445,6 @@ entry: define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmlsl_high_lane_s32_0: ; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2778,7 +2456,6 @@ entry: define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmull_lane_s16_0: ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2788,7 +2465,6 @@ entry: define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmull_lane_s32_0: ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2798,7 +2474,6 @@ entry: define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vqdmull_laneq_s16_0: ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2808,7 +2483,6 @@ entry: define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vqdmull_laneq_s32_0: ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2818,7 +2492,6 @@ entry: define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmull_high_lane_s16_0: ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2829,7 +2502,6 @@ entry: define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmull_high_lane_s32_0: ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2840,7 +2512,6 @@ entry: define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK-LABEL: test_vqdmull_high_laneq_s16_0: ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2851,7 +2522,6 @@ entry: define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK-LABEL: test_vqdmull_high_laneq_s32_0: ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2862,7 +2532,6 @@ entry: define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmulh_lane_s16_0: ; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -2872,7 +2541,6 @@ entry: define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqdmulhq_lane_s16_0: ; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -2882,7 +2550,6 @@ entry: define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmulh_lane_s32_0: ; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -2892,7 +2559,6 @@ entry: define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqdmulhq_lane_s32_0: ; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -2902,7 +2568,6 @@ entry: define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqrdmulh_lane_s16_0: ; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -2912,7 +2577,6 @@ entry: define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) { ; CHECK-LABEL: test_vqrdmulhq_lane_s16_0: ; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -2922,7 +2586,6 @@ entry: define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqrdmulh_lane_s32_0: ; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -2932,7 +2595,6 @@ entry: define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) { ; CHECK-LABEL: test_vqrdmulhq_lane_s32_0: ; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -2941,12 +2603,9 @@ entry: define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmul_lane_f32_0: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %mul = fmul <2 x float> %shuffle, %a @@ -2955,12 +2614,9 @@ entry: define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmulq_lane_f32_0: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %mul = fmul <4 x float> %shuffle, %a @@ -2969,12 +2625,9 @@ entry: define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmul_laneq_f32_0: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %mul = fmul <2 x float> %shuffle, %a @@ -2984,10 +2637,6 @@ entry: define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) { ; CHECK-LABEL: test_vmul_laneq_f64_0: ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_laneq_f64_0: -; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] -; EXYNOS-NEXT: ret entry: %0 = bitcast <1 x double> %a to <8 x i8> %1 = bitcast <8 x i8> %0 to double @@ -2999,12 +2648,9 @@ entry: define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmulq_laneq_f32_0: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %mul = fmul <4 x float> %shuffle, %a @@ -3013,12 +2659,9 @@ entry: define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { ; CHECK-LABEL: test_vmulq_laneq_f64_0: -; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_laneq_f64_0: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %mul = fmul <2 x double> %shuffle, %a @@ -3027,12 +2670,9 @@ entry: define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmulx_lane_f32_0: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulx_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -3041,12 +2681,9 @@ entry: define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) { ; CHECK-LABEL: test_vmulxq_lane_f32_0: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_lane_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -3055,12 +2692,9 @@ entry: define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) { ; CHECK-LABEL: test_vmulxq_lane_f64_0: -; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_lane_f64_0: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -3069,12 +2703,9 @@ entry: define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmulx_laneq_f32_0: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulx_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -3083,12 +2714,9 @@ entry: define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) { ; CHECK-LABEL: test_vmulxq_laneq_f32_0: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_laneq_f32_0: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -3097,12 +2725,9 @@ entry: define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) { ; CHECK-LABEL: test_vmulxq_laneq_f64_0: -; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_laneq_f64_0: -; EXYNOS: dup [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d entry: %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -3111,14 +2736,11 @@ entry: define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) { ; CHECK-LABEL: optimize_dup: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: optimize_dup: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s entry: %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a) @@ -3130,15 +2752,12 @@ entry: define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) { ; CHECK-LABEL: no_optimize_dup: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: no_optimize_dup: -; EXYNOS: dup [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS: dup [[y:v[0-9]+]].4s, {{v[0-9]+}}.s[1] -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[y]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM1: dup [[W:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[W]].4s entry: %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a) @@ -3150,8 +2769,7 @@ entry: define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_a57(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="cortex-a57" { ; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -3160,9 +2778,8 @@ entry: define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m1(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m1" { ; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1: -; CHECK: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; CHECK-NEXT: ret +; GENERIC: dup [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s entry: %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) diff --git a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll index 453334dce601..2fb9d3b2d030 100644 --- a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll +++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll @@ -87,4 +87,13 @@ for.end: ret double %v0 } +define <2 x i64> @t6() { +; ALL-LABEL: t6: +; CYCLONE: movi.16b v0, #0 +; KRYO: movi v0.2d, #0000000000000000 +; FALKOR: movi v0.2d, #0000000000000000 + ret <2 x i64> zeroinitializer +} + + declare double @sin(double) diff --git a/test/CodeGen/AArch64/chkstk.ll b/test/CodeGen/AArch64/chkstk.ll new file mode 100644 index 000000000000..1c2e5528f10c --- /dev/null +++ b/test/CodeGen/AArch64/chkstk.ll @@ -0,0 +1,25 @@ +; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs %s -o - \ +; RUN: | FileCheck -check-prefix CHECK-DEFAULT-CODE-MODEL %s + +; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs -code-model=large %s -o - \ +; RUN: | FileCheck -check-prefix CHECK-LARGE-CODE-MODEL %s + +define void @check_watermark() { +entry: + %buffer = alloca [4096 x i8], align 1 + ret void +} + +; CHECK-DEFAULT-CODE-MODEL: check_watermark: +; CHECK-DEFAULT-CODE-MODEL-DAG: stp x29, x30, [sp +; CHECK-DEFAULT-CODE-MODEL-DAG: orr x15, xzr, #0x100 +; CHECK-DEFAULT-CODE-MODEL: bl __chkstk +; CHECK-DEFAULT-CODE-MODEL: sub sp, sp, x15, lsl #4 + +; CHECK-LARGE-CODE-MODEL: check_watermark: +; CHECK-LARGE-CODE-MODEL-DAG: stp x29, x30, [sp +; CHECK-LARGE-CODE-MODEL-DAG: orr x15, xzr, #0x100 +; CHECK-LARGE-CODE-MODEL-DAG: adrp x16, __chkstk +; CHECK-LARGE-CODE-MODEL-DAG: add x16, x16, __chkstk +; CHECK-LARGE-CODE-MODEL: blr x16 +; CHECK-LARGE-CODE-MODEL: sub sp, sp, x15, lsl #4 diff --git a/test/CodeGen/AArch64/ldst-paired-aliasing.ll b/test/CodeGen/AArch64/ldst-paired-aliasing.ll index 9c698b5fdcc6..9b0b51d369a3 100644 --- a/test/CodeGen/AArch64/ldst-paired-aliasing.ll +++ b/test/CodeGen/AArch64/ldst-paired-aliasing.ll @@ -10,11 +10,10 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3 define i32 @main() local_unnamed_addr #1 { ; Make sure the stores happen in the correct order (the exact instructions could change). ; CHECK-LABEL: main: -; CHECK: stp xzr, xzr, [sp, #72] +; CHECK: str xzr, [sp, #80] ; CHECK: str w9, [sp, #80] -; CHECK: str q0, [sp, #48] +; CHECK: stp q0, q0, [sp, #48] ; CHECK: ldr w8, [sp, #48] -; CHECK: str q0, [sp, #64] for.body.lr.ph.i.i.i.i.i.i63: %b1 = alloca [10 x i32], align 16 diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index 228d3c7d4306..71c4c83c28f9 100644 --- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -251,7 +251,8 @@ entry: ; R600: MOVA_INT -; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: +; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: +; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0 diff --git a/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll b/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll new file mode 100644 index 000000000000..f97785beab6f --- /dev/null +++ b/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll @@ -0,0 +1,32 @@ +; RUN: llc -mtriple=amdgcn--amdhsa-amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Effectively, check that the compile finishes; in the case +; of an infinite loop, llc toggles between merging 2 ST4s +; ( MergeConsecutiveStores() ) and breaking the resulting ST8 +; apart ( LegalizeStoreOps() ). + +target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" + +; GCN-LABEL: {{^}}_Z6brokenPd: +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} +define amdgpu_kernel void @_Z6brokenPd(double* %arg) { +bb: + %tmp = alloca double, align 8, addrspace(5) + %tmp1 = alloca double, align 8, addrspace(5) + %tmp2 = load double, double* %arg, align 8 + br i1 1, label %bb6, label %bb4 + +bb3: ; No predecessors! + br label %bb4 + +bb4: ; preds = %bb3, %bb + %tmp5 = phi double addrspace(5)* [ %tmp1, %bb3 ], [ %tmp, %bb ] + store double %tmp2, double addrspace(5)* %tmp5, align 8 + br label %bb6 + +bb6: ; preds = %bb4, %bb + %tmp7 = phi double [ 0x7FF8123000000000, %bb4 ], [ 0x7FF8000000000000, %bb ] + store double %tmp7, double* %arg, align 8 + ret void +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir index 7c2666e3680f..8b9b83f6d0e7 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir @@ -6,6 +6,7 @@ define void @test_trunc_and_zext_s16() { ret void } define void @test_trunc_and_anyext_s8() { ret void } define void @test_trunc_and_anyext_s16() { ret void } + define void @test_trunc_s64() #0 { ret void } define void @test_add_s32() { ret void } define void @test_add_fold_imm_s32() { ret void } @@ -46,6 +47,10 @@ define void @test_gep() { ret void } define void @test_constant_imm() { ret void } define void @test_constant_cimm() { ret void } + define void @test_pointer_constant() { ret void } + + define void @test_inttoptr_s32() { ret void } + define void @test_ptrtoint_s32() { ret void } define void @test_select_s32() { ret void } define void @test_select_ptr() { ret void } @@ -241,6 +246,36 @@ body: | ; CHECK: BX_RET 14, %noreg, implicit %r0 ... --- +name: test_trunc_s64 +# CHECK-LABEL: name: test_trunc_s64 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: fprb } + - { id: 1, class: gprb } + - { id: 2, class: gprb } +body: | + bb.0: + liveins: %r0, %d0 + + %0(s64) = COPY %d0 + ; CHECK: [[VREG:%[0-9]+]]:dpr = COPY %d0 + + %2(p0) = COPY %r0 + ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY %r0 + + %1(s32) = G_TRUNC %0(s64) + ; CHECK: [[VREGTRUNC:%[0-9]+]]:gpr, [[UNINTERESTING:%[0-9]+]]:gpr = VMOVRRD [[VREG]] + + G_STORE %1(s32), %2 :: (store 4) + ; CHECK: STRi12 [[VREGTRUNC]], [[PTR]], 0, 14, %noreg + + BX_RET 14, %noreg + ; CHECK: BX_RET 14, %noreg +... +--- name: test_add_s32 # CHECK-LABEL: name: test_add_s32 legalized: true @@ -1075,6 +1110,71 @@ body: | BX_RET 14, %noreg, implicit %r0 ... --- +name: test_pointer_constant +# CHECK-LABEL: name: test_pointer_constant +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } +body: | + bb.0: + %0(p0) = G_CONSTANT i32 0 + ; CHECK: %[[C:[0-9]+]]:gpr = MOVi 0, 14, %noreg, %noreg + + %r0 = COPY %0(p0) + BX_RET 14, %noreg, implicit %r0 +... +--- +name: test_inttoptr_s32 +# CHECK-LABEL: name: test_inttoptr_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: %r0 + + %0(s32) = COPY %r0 + %1(p0) = G_INTTOPTR %0(s32) + ; CHECK: [[INT:%[0-9]+]]:gpr = COPY %r0 + ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY [[INT]] + + %r0 = COPY %1(p0) + ; CHECK: %r0 = COPY [[PTR]] + + BX_RET 14, %noreg, implicit %r0 +... +--- +name: test_ptrtoint_s32 +# CHECK-LABEL: name: test_ptrtoint_s32 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: %r0 + + %0(p0) = COPY %r0 + %1(s32) = G_PTRTOINT %0(p0) + ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY %r0 + ; CHECK: [[INT:%[0-9]+]]:gpr = COPY [[PTR]] + + %r0 = COPY %1(s32) + ; CHECK: %r0 = COPY [[INT]] + + BX_RET 14, %noreg, implicit %r0 +... +--- name: test_select_s32 # CHECK-LABEL: name: test_select_s32 legalized: true diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir index e3e206cf76e9..204434e981b4 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir @@ -3,6 +3,9 @@ define void @test_sext_s8() { ret void } define void @test_zext_s16() { ret void } + define void @test_inttoptr_s32() { ret void } + define void @test_ptrtoint_s32() { ret void } + define void @test_add_s8() { ret void } define void @test_add_s16() { ret void } define void @test_add_s32() { ret void } @@ -101,6 +104,50 @@ body: | BX_RET 14, %noreg, implicit %r0 ... --- +name: test_inttoptr_s32 +# CHECK-LABEL: name: test_inttoptr_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: %r0 + + %0(s32) = COPY %r0 + %1(p0) = G_INTTOPTR %0(s32) + ; G_INTTOPTR with s32 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}:_(p0) = G_INTTOPTR {{%[0-9]+}} + %r0 = COPY %1(p0) + BX_RET 14, %noreg, implicit %r0 +... +--- +name: test_ptrtoint_s32 +# CHECK-LABEL: name: test_ptrtoint_s32 +legalized: false +# CHECK: legalized: true +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: %r0 + + %0(p0) = COPY %r0 + %1(s32) = G_PTRTOINT %0(p0) + ; G_PTRTOINT with s32 is legal, so we should find it unchanged in the output + ; CHECK: {{%[0-9]+}}:_(s32) = G_PTRTOINT {{%[0-9]+}} + %r0 = COPY %1(s32) + BX_RET 14, %noreg, implicit %r0 +... +--- name: test_add_s8 # CHECK-LABEL: name: test_add_s8 legalized: false @@ -826,6 +873,7 @@ registers: - { id: 2, class: _ } - { id: 3, class: _ } - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0 @@ -856,6 +904,10 @@ body: | ; CHECK: {{%[0-9]+}}:_(s1) = G_TRUNC [[EXT]](s32) ; CHECK-NOT: G_CONSTANT i1 + %5(p0) = G_CONSTANT 0 + G_STORE %5(p0), %4(p0) :: (store 4) + ; CHECK: {{%[0-9]+}}:_(p0) = G_CONSTANT 0 + %r0 = COPY %0(s32) BX_RET 14, %noreg, implicit %r0 ... diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index 044740e33a2d..175333626f97 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -24,6 +24,9 @@ define void @test_constants() { ret void } + define void @test_inttoptr_s32() { ret void } + define void @test_ptrtoint_s32() { ret void } + @a_global = global float 1.0 define void @test_globals() { ret void } @@ -31,6 +34,7 @@ define void @test_anyext_s16_32() { ret void } define void @test_trunc_s32_16() { ret void } + define void @test_trunc_s64_32() #0 { ret void } define void @test_icmp_eq_s32() { ret void } define void @test_fcmp_one_s32() #0 { ret void } @@ -496,6 +500,44 @@ body: | BX_RET 14, %noreg, implicit %r0 ... --- +name: test_inttoptr_s32 +# CHECK-LABEL: name: test_inttoptr_s32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb, preferred-register: '' } +# CHECK: - { id: 1, class: gprb, preferred-register: '' } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + %0(s32) = COPY %r0 + %1(p0) = G_INTTOPTR %0(s32) + %r0 = COPY %1(p0) + BX_RET 14, %noreg, implicit %r0 +... +--- +name: test_ptrtoint_s32 +# CHECK-LABEL: name: test_ptrtoint_s32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb, preferred-register: '' } +# CHECK: - { id: 1, class: gprb, preferred-register: '' } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + %0(p0) = COPY %r0 + %1(s32) = G_PTRTOINT %0(p0) + %r0 = COPY %1(s32) + BX_RET 14, %noreg, implicit %r0 +... +--- name: test_globals # CHECK-LABEL: name: test_globals legalized: true @@ -584,6 +626,30 @@ body: | BX_RET 14, %noreg ... --- +name: test_trunc_s64_32 +# CHECK-LABEL: name: test_trunc_s64_32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: fprb, preferred-register: '' } +# CHECK: - { id: 1, class: gprb, preferred-register: '' } +# CHECK: - { id: 2, class: gprb, preferred-register: '' } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.0: + liveins: %r0, %d0 + + %0(s64) = COPY %d0 + %2(p0) = COPY %r0 + %1(s32) = G_TRUNC %0(s64) + G_STORE %1(s32), %2 :: (store 4) + BX_RET 14, %noreg +... +--- name: test_icmp_eq_s32 # CHECK-LABEL: name: test_icmp_eq_s32 legalized: true diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll index 78d3ebf371a4..9373c5d44210 100644 --- a/test/CodeGen/ARM/avoid-cpsr-rmw.ll +++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 -simplifycfg-sink-common=false | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift -simplifycfg-sink-common=false | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT ; Avoid some 's' 16-bit instruction which partially update CPSR (and add false ; dependency) when it isn't dependent on last CPSR defining instruction. ; rdar://8928208 diff --git a/test/CodeGen/ARM/su-addsub-overflow.ll b/test/CodeGen/ARM/su-addsub-overflow.ll new file mode 100644 index 000000000000..eef531282033 --- /dev/null +++ b/test/CodeGen/ARM/su-addsub-overflow.ll @@ -0,0 +1,135 @@ +; RUN: llc < %s -mtriple=arm-eabi -mcpu=generic | FileCheck %s + +define i32 @sadd(i32 %a, i32 %b) local_unnamed_addr #0 { +; CHECK-LABEL: sadd: +; CHECK: mov r[[R0:[0-9]+]], r0 +; CHECK-NEXT: add r[[R1:[0-9]+]], r[[R0]], r1 +; CHECK-NEXT: cmp r[[R1]], r[[R0]] +; CHECK-NEXT: movvc pc, lr +entry: + %0 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) + %1 = extractvalue { i32, i1 } %0, 1 + br i1 %1, label %trap, label %cont + +trap: + tail call void @llvm.trap() #2 + unreachable + +cont: + %2 = extractvalue { i32, i1 } %0, 0 + ret i32 %2 + +} + +define i32 @uadd(i32 %a, i32 %b) local_unnamed_addr #0 { +; CHECK-LABEL: uadd: +; CHECK: mov r[[R0:[0-9]+]], r0 +; CHECK-NEXT: adds r[[R1:[0-9]+]], r[[R0]], r1 +; CHECK-NEXT: cmp r[[R1]], r[[R0]] +; CHECK-NEXT: movhs pc, lr +entry: + %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) + %1 = extractvalue { i32, i1 } %0, 1 + br i1 %1, label %trap, label %cont + +trap: + tail call void @llvm.trap() #2 + unreachable + +cont: + %2 = extractvalue { i32, i1 } %0, 0 + ret i32 %2 + +} + +define i32 @ssub(i32 %a, i32 %b) local_unnamed_addr #0 { +; CHECK-LABEL: ssub: +; CHECK: cmp r0, r1 +; CHECK-NEXT: subvc r0, r0, r1 +; CHECK-NEXT: movvc pc, lr +entry: + %0 = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) + %1 = extractvalue { i32, i1 } %0, 1 + br i1 %1, label %trap, label %cont + +trap: + tail call void @llvm.trap() #2 + unreachable + +cont: + %2 = extractvalue { i32, i1 } %0, 0 + ret i32 %2 + +} + +define i32 @usub(i32 %a, i32 %b) local_unnamed_addr #0 { +; CHECK-LABEL: usub: +; CHECK: mov r[[R0:[0-9]+]], r0 +; CHECK-NEXT: subs r[[R1:[0-9]+]], r[[R0]], r1 +; CHECK-NEXT: cmp r[[R0]], r1 +; CHECK-NEXT: movhs pc, lr +entry: + %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) + %1 = extractvalue { i32, i1 } %0, 1 + br i1 %1, label %trap, label %cont + +trap: + tail call void @llvm.trap() #2 + unreachable + +cont: + %2 = extractvalue { i32, i1 } %0, 0 + ret i32 %2 + +} + +define void @sum(i32* %a, i32* %b, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: sum: +; CHECK: ldr [[R0:r[0-9]+]], +; CHECK-NEXT: ldr [[R1:r[0-9]+|lr]], +; CHECK-NEXT: add [[R2:r[0-9]+]], [[R1]], [[R0]] +; CHECK-NEXT: cmp [[R2]], [[R1]] +; CHECK-NEXT: strvc [[R2]], +; CHECK-NEXT: addvc +; CHECK-NEXT: cmpvc +; CHECK-NEXT: bvs +entry: + %cmp7 = icmp eq i32 %n, 0 + br i1 %cmp7, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void + +for.body: + %i.08 = phi i32 [ %7, %cont2 ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.08 + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.08 + %1 = load i32, i32* %arrayidx1, align 4 + %2 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %1, i32 %0) + %3 = extractvalue { i32, i1 } %2, 1 + br i1 %3, label %trap, label %cont + +trap: + tail call void @llvm.trap() #2 + unreachable + +cont: + %4 = extractvalue { i32, i1 } %2, 0 + store i32 %4, i32* %arrayidx1, align 4 + %5 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %i.08, i32 1) + %6 = extractvalue { i32, i1 } %5, 1 + br i1 %6, label %trap, label %cont2 + +cont2: + %7 = extractvalue { i32, i1 } %5, 0 + %cmp = icmp eq i32 %7, %n + br i1 %cmp, label %for.cond.cleanup, label %for.body + +} + +declare void @llvm.trap() #2 +declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) #1 +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 +declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) #1 +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1 diff --git a/test/CodeGen/ARM/usat.ll b/test/CodeGen/ARM/usat.ll new file mode 100644 index 000000000000..8f19d11ef7bb --- /dev/null +++ b/test/CodeGen/ARM/usat.ll @@ -0,0 +1,214 @@ +; RUN: llc -mtriple=armv4t-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V4T +; RUN: llc -mtriple=armv6-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V6 +; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V6T2 + +; Check for several conditions that should result in USAT. +; For example, the base test is equivalent to +; x < 0 ? 0 : (x > k ? k : x) in C. All patterns that bound x +; to the interval [0, k] where k + 1 is a power of 2 can be +; transformed into USAT. At the end there are some tests +; checking that conditionals are not transformed if they don't +; match the right pattern. + +; +; Base tests with different bit widths +; + +; x < 0 ? 0 : (x > k ? k : x) +; 32-bit base test +define i32 @unsigned_sat_base_32bit(i32 %x) #0 { +; CHECK-LABEL: unsigned_sat_base_32bit: +; V6: usat r0, #23, r0 +; V6T2: usat r0, #23, r0 +; V4T-NOT: usat +entry: + %cmpLow = icmp slt i32 %x, 0 + %cmpUp = icmp sgt i32 %x, 8388607 + %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x + %saturateLow = select i1 %cmpLow, i32 0, i32 %saturateUp + ret i32 %saturateLow +} + +; x < 0 ? 0 : (x > k ? k : x) +; 16-bit base test +define i16 @unsigned_sat_base_16bit(i16 %x) #0 { +; CHECK-LABEL: unsigned_sat_base_16bit: +; V6: usat r0, #11, r0 +; V6T2: usat r0, #11, r0 +; V4T-NOT: usat +entry: + %cmpLow = icmp slt i16 %x, 0 + %cmpUp = icmp sgt i16 %x, 2047 + %saturateUp = select i1 %cmpUp, i16 2047, i16 %x + %saturateLow = select i1 %cmpLow, i16 0, i16 %saturateUp + ret i16 %saturateLow +} + +; x < 0 ? 0 : (x > k ? k : x) +; 8-bit base test +define i8 @unsigned_sat_base_8bit(i8 %x) #0 { +; CHECK-LABEL: unsigned_sat_base_8bit: +; V6: usat r0, #5, r0 +; V6T2: usat r0, #5, r0 +; V4T-NOT: usat +entry: + %cmpLow = icmp slt i8 %x, 0 + %cmpUp = icmp sgt i8 %x, 31 + %saturateUp = select i1 %cmpUp, i8 31, i8 %x + %saturateLow = select i1 %cmpLow, i8 0, i8 %saturateUp + ret i8 %saturateLow +} + +; +; Tests where the conditionals that check for upper and lower bounds, +; or the < and > operators, are arranged in different ways. Only some +; of the possible combinations that lead to USAT are tested. +; +; x < 0 ? 0 : (x < k ? x : k) +define i32 @unsigned_sat_lower_upper_1(i32 %x) #0 { +; CHECK-LABEL: unsigned_sat_lower_upper_1: +; V6: usat r0, #23, r0 +; V6T2: usat r0, #23, r0 +; V4T-NOT: usat +entry: + %cmpLow = icmp slt i32 %x, 0 + %cmpUp = icmp slt i32 %x, 8388607 + %saturateUp = select i1 %cmpUp, i32 %x, i32 8388607 + %saturateLow = select i1 %cmpLow, i32 0, i32 %saturateUp + ret i32 %saturateLow +} + +; x > 0 ? (x > k ? k : x) : 0 +define i32 @unsigned_sat_lower_upper_2(i32 %x) #0 { +; CHECK-LABEL: unsigned_sat_lower_upper_2: +; V6: usat r0, #23, r0 +; V6T2: usat r0, #23, r0 +; V4T-NOT: usat +entry: + %cmpLow = icmp sgt i32 %x, 0 + %cmpUp = icmp sgt i32 %x, 8388607 + %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x + %saturateLow = select i1 %cmpLow, i32 %saturateUp, i32 0 + ret i32 %saturateLow +} + +; x < k ? (x < 0 ? 0 : x) : k +define i32 @unsigned_sat_upper_lower_1(i32 %x) #0 { +; CHECK-LABEL: unsigned_sat_upper_lower_1: +; V6: usat r0, #23, r0 +; V6T2: usat r0, #23, r0 +; V4T-NOT: usat +entry: + %cmpUp = icmp slt i32 %x, 8388607 + %cmpLow = icmp slt i32 %x, 0 + %saturateLow = select i1 %cmpLow, i32 0, i32 %x + %saturateUp = select i1 %cmpUp, i32 %saturateLow, i32 8388607 + ret i32 %saturateUp +} + +; x > k ? k : (x < 0 ? 0 : x) +define i32 @unsigned_sat_upper_lower_2(i32 %x) #0 { +; CHECK-LABEL: unsigned_sat_upper_lower_2: +; V6: usat r0, #23, r0 +; V6T2: usat r0, #23, r0 +; V4T-NOT: usat +entry: + %cmpUp = icmp sgt i32 %x, 8388607 + %cmpLow = icmp slt i32 %x, 0 + %saturateLow = select i1 %cmpLow, i32 0, i32 %x + %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + ret i32 %saturateUp +} + +; k < x ? k : (x > 0 ? x : 0) +define i32 @unsigned_sat_upper_lower_3(i32 %x) #0 { +; CHECK-LABEL: unsigned_sat_upper_lower_3: +; V6: usat r0, #23, r0 +; V6T2: usat r0, #23, r0 +; V4T-NOT: usat +entry: + %cmpUp = icmp slt i32 8388607, %x + %cmpLow = icmp sgt i32 %x, 0 + %saturateLow = select i1 %cmpLow, i32 %x, i32 0 + %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + ret i32 %saturateUp +} + +; +; The following tests check for patterns that should not transform +; into USAT but are similar enough that could confuse the selector. +; +; x > k ? k : (x > 0 ? 0 : x) +; First condition upper-saturates, second doesn't lower-saturate. +define i32 @no_unsigned_sat_missing_lower(i32 %x) #0 { +; CHECK-LABEL: no_unsigned_sat_missing_lower +; CHECK-NOT: usat +entry: + %cmpUp = icmp sgt i32 %x, 8388607 + %cmpLow = icmp sgt i32 %x, 0 + %saturateLow = select i1 %cmpLow, i32 0, i32 %x + %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + ret i32 %saturateUp +} + +; x < k ? k : (x < 0 ? 0 : x) +; Second condition lower-saturates, first doesn't upper-saturate. +define i32 @no_unsigned_sat_missing_upper(i32 %x) #0 { +; CHECK-LABEL: no_unsigned_sat_missing_upper: +; CHECK-NOT: usat +entry: + %cmpUp = icmp slt i32 %x, 8388607 + %cmpLow = icmp slt i32 %x, 0 + %saturateLow = select i1 %cmpLow, i32 0, i32 %x + %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + ret i32 %saturateUp +} + +; Lower constant is different in the select and in the compare +define i32 @no_unsigned_sat_incorrect_constant(i32 %x) #0 { +; CHECK-LABEL: no_unsigned_sat_incorrect_constant: +; CHECK-NOT: usat +entry: + %cmpUp = icmp sgt i32 %x, 8388607 + %cmpLow = icmp slt i32 %x, 0 + %saturateLow = select i1 %cmpLow, i32 -1, i32 %x + %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + ret i32 %saturateUp +} + +; The interval is not [0, k] +define i32 @no_unsigned_sat_incorrect_interval(i32 %x) #0 { +; CHECK-LABEL: no_unsigned_sat_incorrect_interval: +; CHECK-NOT: usat +entry: + %cmpUp = icmp sgt i32 %x, 8388607 + %cmpLow = icmp slt i32 %x, -4 + %saturateLow = select i1 %cmpLow, i32 -4, i32 %x + %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + ret i32 %saturateUp +} + +; The returned value (y) is not the same as the tested value (x). +define i32 @no_unsigned_sat_incorrect_return(i32 %x, i32 %y) #0 { +; CHECK-LABEL: no_unsigned_sat_incorrect_return: +; CHECK-NOT: usat +entry: + %cmpUp = icmp sgt i32 %x, 8388607 + %cmpLow = icmp slt i32 %x, 0 + %saturateLow = select i1 %cmpLow, i32 0, i32 %y + %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + ret i32 %saturateUp +} + +; One of the values in a compare (y) is not the same as the rest +; of the compare and select values (x). +define i32 @no_unsigned_sat_incorrect_compare(i32 %x, i32 %y) #0 { +; CHECK-LABEL: no_unsigned_sat_incorrect_compare: +; CHECK-NOT: usat +entry: + %cmpUp = icmp sgt i32 %x, 8388607 + %cmpLow = icmp slt i32 %y, 0 + %saturateLow = select i1 %cmpLow, i32 0, i32 %x + %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow + ret i32 %saturateUp +} diff --git a/test/CodeGen/BPF/objdump_imm_hex.ll b/test/CodeGen/BPF/objdump_imm_hex.ll new file mode 100644 index 000000000000..a245a6c791f2 --- /dev/null +++ b/test/CodeGen/BPF/objdump_imm_hex.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=bpfel -filetype=obj -o - %s | llvm-objdump -d - | FileCheck --check-prefix=CHECK-DEC %s +; RUN: llc -march=bpfel -filetype=obj -o - %s | llvm-objdump -d -print-imm-hex - | FileCheck --check-prefix=CHECK-HEX %s + +; Source Code: +; int gbl; +; int test(unsigned long long a, unsigned long long b) { +; int ret = 0; +; if (a == 0xABCDABCDabcdabcdULL) { +; gbl = gbl * gbl * 2; +; ret = 1; +; goto out; +; } +; if (b == 0xABCDabcdabcdULL) { +; gbl = gbl * 4; +; ret = 2; +; } +; out: +; return ret; +; } + +@gbl = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind +define i32 @test(i64, i64) local_unnamed_addr #0 { +; CHECK-LABEL: test + %3 = icmp eq i64 %0, -6067004223159161907 + br i1 %3, label %4, label %8 +; CHECK-DEC: 18 03 00 00 cd ab cd ab 00 00 00 00 cd ab cd ab r3 = -6067004223159161907 ll +; CHECK-DEC: 5d 31 07 00 00 00 00 00 if r1 != r3 goto +7 +; CHECK-HEX: 18 03 00 00 cd ab cd ab 00 00 00 00 cd ab cd ab r3 = -0x5432543254325433 ll +; CHECK-HEX: 5d 31 07 00 00 00 00 00 if r1 != r3 goto +0x7 + +; <label>:4: ; preds = %2 + %5 = load i32, i32* @gbl, align 4 + %6 = shl i32 %5, 1 +; CHECK-DEC: 67 01 00 00 01 00 00 00 r1 <<= 1 +; CHECK-HEX: 67 01 00 00 01 00 00 00 r1 <<= 0x1 + %7 = mul i32 %6, %5 + br label %13 + +; <label>:8: ; preds = %2 + %9 = icmp eq i64 %1, 188899839028173 +; CHECK-DEC: 18 01 00 00 cd ab cd ab 00 00 00 00 cd ab 00 00 r1 = 188899839028173 ll +; CHECK-HEX: 18 01 00 00 cd ab cd ab 00 00 00 00 cd ab 00 00 r1 = 0xabcdabcdabcd ll + br i1 %9, label %10, label %16 + +; <label>:10: ; preds = %8 + %11 = load i32, i32* @gbl, align 4 + %12 = shl nsw i32 %11, 2 + br label %13 + +; <label>:13: ; preds = %4, %10 + %14 = phi i32 [ %12, %10 ], [ %7, %4 ] + %15 = phi i32 [ 2, %10 ], [ 1, %4 ] + store i32 %14, i32* @gbl, align 4 +; CHECK-DEC: 63 12 00 00 00 00 00 00 *(u32 *)(r2 + 0) = r1 +; CHECK-HEX: 63 12 00 00 00 00 00 00 *(u32 *)(r2 + 0x0) = r1 + br label %16 + +; <label>:16: ; preds = %13, %8 + %17 = phi i32 [ 0, %8 ], [ %15, %13 ] + ret i32 %17 +} + +attributes #0 = { norecurse nounwind } diff --git a/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll b/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll new file mode 100644 index 000000000000..f96dbf2af496 --- /dev/null +++ b/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll @@ -0,0 +1,19 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Check that this doesn't crash. +; CHECK: sfcmp + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +define void @fred() #0 { +b0: + %v1 = load <16 x float>, <16 x float>* null, align 8 + %v2 = fcmp olt <16 x float> undef, %v1 + %v3 = select <16 x i1> %v2, <16 x i16> undef, <16 x i16> zeroinitializer + %v4 = sext <16 x i16> %v3 to <16 x i32> + store <16 x i32> %v4, <16 x i32>* undef, align 64 + unreachable +} + +attributes #0 = { noinline norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx-length64b,+hvxv60" } diff --git a/test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll b/test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll new file mode 100644 index 000000000000..4cbd00837fc6 --- /dev/null +++ b/test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Check that this testcase doesn't crash. +; CHECK: sfcmp + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +define void @fred() #0 { +b0: + %v1 = fcmp olt <16 x float> zeroinitializer, undef + %v2 = select <16 x i1> %v1, <16 x i16> undef, <16 x i16> zeroinitializer + %v3 = sext <16 x i16> %v2 to <16 x i32> + store <16 x i32> %v3, <16 x i32>* undef, align 128 + unreachable +} + +attributes #0 = { noinline norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx-length64b" } diff --git a/test/CodeGen/Hexagon/autohvx/isel-select-const.ll b/test/CodeGen/Hexagon/autohvx/isel-select-const.ll new file mode 100644 index 000000000000..c251292c9da4 --- /dev/null +++ b/test/CodeGen/Hexagon/autohvx/isel-select-const.ll @@ -0,0 +1,32 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Check that this doesn't crash. +; CHECK: vlut32 + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon-unknown--elf" + +define void @fred() #0 { +b0: + %v1 = tail call <16 x i32> @llvm.hexagon.V6.vlutvvb.oracc(<16 x i32> undef, <16 x i32> <i32 151388928, i32 353505036, i32 555621144, i32 757737252, i32 959853360, i32 1161969468, i32 1364085576, i32 1566201684, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> undef, i32 3) + %v2 = bitcast <16 x i32> %v1 to <64 x i8> + %v3 = shufflevector <64 x i8> %v2, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %v4 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %v3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + %v5 = bitcast <64 x i8> %v4 to <16 x i32> + %v6 = tail call <16 x i32> @llvm.hexagon.V6.vshuffb(<16 x i32> %v5) + store <16 x i32> %v6, <16 x i32>* undef, align 1 + %v7 = tail call <16 x i32> @llvm.hexagon.V6.vlutvvb.oracc(<16 x i32> undef, <16 x i32> <i32 151388928, i32 353505036, i32 555621144, i32 757737252, i32 959853360, i32 1161969468, i32 1364085576, i32 1566201684, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> zeroinitializer, i32 3) + %v8 = bitcast <16 x i32> %v7 to <64 x i8> + %v9 = shufflevector <64 x i8> %v8, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %v10 = shufflevector <32 x i8> %v9, <32 x i8> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> + %v11 = bitcast <64 x i8> %v10 to <16 x i32> + %v12 = tail call <16 x i32> @llvm.hexagon.V6.vshuffb(<16 x i32> %v11) + store <16 x i32> %v12, <16 x i32>* undef, align 1 + unreachable +} + +declare <16 x i32> @llvm.hexagon.V6.vshuffb(<16 x i32>) #1 +declare <16 x i32> @llvm.hexagon.V6.vlutvvb.oracc(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1 + +attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-length64b" } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/Hexagon/expand-vstorerw-undef.ll b/test/CodeGen/Hexagon/expand-vstorerw-undef.ll index 88eaec938fd3..5ac0f59bd2d1 100644 --- a/test/CodeGen/Hexagon/expand-vstorerw-undef.ll +++ b/test/CodeGen/Hexagon/expand-vstorerw-undef.ll @@ -12,7 +12,7 @@ ; CHECK-LABEL: fred: ; CHECK: v[[REG:[0-9]+]] = vsplat -; CHECK: vmem(r29+#6) = v[[REG]] +; CHECK: vmem(r29+#{{[0-9]+}}) = v[[REG]] target triple = "hexagon" diff --git a/test/CodeGen/Hexagon/v60-cur.ll b/test/CodeGen/Hexagon/v60-cur.ll index 26d40c9a6975..d0ffe1d8fdd8 100644 --- a/test/CodeGen/Hexagon/v60-cur.ll +++ b/test/CodeGen/Hexagon/v60-cur.ll @@ -1,9 +1,8 @@ -; RUN: llc -march=hexagon -enable-pipeliner=false < %s | FileCheck %s +; RUN: llc -march=hexagon < %s | FileCheck %s ; Test that we generate a .cur -; CHECK: v{{[0-9]*}}.cur{{ *}} -; CHECK: v{{[0-9]*}}.cur{{ *}} +; CHECK: v{{[0-9]*}}.cur define void @conv3x3_i(i8* noalias nocapture readonly %iptr0, i32 %shift, i32 %width) #0 { entry: diff --git a/test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll b/test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll new file mode 100644 index 000000000000..af2a55ea47d5 --- /dev/null +++ b/test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=hexagon -debug-only=isel < %s 2>/dev/null +; REQUIRES: asserts + +; Make sure that this doesn't crash. Debug option enabled a failing assertion +; about type mismatch in formal arguments. +; CHECK: vaddub + +define i1 @t_i4x8(<4 x i8> %a, <4 x i8> %b) nounwind { +entry: + %0 = add <4 x i8> %a, %b + %1 = bitcast <4 x i8> %0 to <32 x i1> + %2 = extractelement <32 x i1> %1, i32 0 + ret i1 %2 +} diff --git a/test/CodeGen/Hexagon/vect/vect-infloop.ll b/test/CodeGen/Hexagon/vect/vect-infloop.ll index 4de390159fdd..9ee0b0ab3aa6 100644 --- a/test/CodeGen/Hexagon/vect/vect-infloop.ll +++ b/test/CodeGen/Hexagon/vect/vect-infloop.ll @@ -1,10 +1,10 @@ ; Extracted from test/CodeGen/Generic/vector-casts.ll: used to loop indefinitely. ; RUN: llc -march=hexagon < %s | FileCheck %s -; CHECK: combine +; CHECK: convert_df2w define void @a(<2 x double>* %p, <2 x i8>* %q) { - %t = load <2 x double>, <2 x double>* %p - %r = fptosi <2 x double> %t to <2 x i8> - store <2 x i8> %r, <2 x i8>* %q - ret void + %t = load <2 x double>, <2 x double>* %p + %r = fptosi <2 x double> %t to <2 x i8> + store <2 x i8> %r, <2 x i8>* %q + ret void } diff --git a/test/CodeGen/Mips/llvm-ir/extractelement.ll b/test/CodeGen/Mips/llvm-ir/extractelement.ll index f7b8ea5f9e15..4f926cbee0b2 100644 --- a/test/CodeGen/Mips/llvm-ir/extractelement.ll +++ b/test/CodeGen/Mips/llvm-ir/extractelement.ll @@ -15,5 +15,5 @@ define i1 @via_stack_bug(i8 signext %idx) { ; ALL-DAG: sh [[ONE]], 6($sp) ; ALL-DAG: andi [[MASKED_IDX:\$[0-9]+]], $4, 1 ; ALL-DAG: addiu [[VPTR:\$[0-9]+]], $sp, 6 -; ALL-DAG: or [[EPTR:\$[0-9]+]], [[MASKED_IDX]], [[VPTR]] +; ALL-DAG: or [[EPTR:\$[0-9]+]], [[VPTR]], [[MASKED_IDX]] ; ALL: lbu $2, 0([[EPTR]]) diff --git a/test/CodeGen/Mips/long-call-mcount.ll b/test/CodeGen/Mips/long-call-mcount.ll new file mode 100644 index 000000000000..70a4410d060b --- /dev/null +++ b/test/CodeGen/Mips/long-call-mcount.ll @@ -0,0 +1,19 @@ +; Check call to mcount in case of long/short call options. +; RUN: llc -march=mips -target-abi o32 --mattr=+long-calls,+noabicalls < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,LONG %s +; RUN: llc -march=mips -target-abi o32 --mattr=-long-calls,+noabicalls < %s \ +; RUN: | FileCheck -check-prefixes=CHECK,SHORT %s + +; Function Attrs: noinline nounwind optnone +define void @foo() #0 { +entry: + ret void + +; CHECK-LABEL: foo +; LONG: lui $1, %hi(_mcount) +; LONG-NEXT: addiu $25, $1, %lo(_mcount) +; LONG-NEXT: jalr $25 +; SHORT: jal _mcount +} + +attributes #0 = { "instrument-function-entry-inlined"="_mcount" } diff --git a/test/CodeGen/Mips/sll-micromips-r6-encoding.mir b/test/CodeGen/Mips/sll-micromips-r6-encoding.mir new file mode 100644 index 000000000000..85ce251ac315 --- /dev/null +++ b/test/CodeGen/Mips/sll-micromips-r6-encoding.mir @@ -0,0 +1,46 @@ +# RUN: llc -march=mips -mcpu=mips32r6 -mattr=+micromips %s -start-after=xray-instrumentation -o - -show-mc-encoding | FileCheck %s + +# Test that the 'sll $zero, $zero, 0' is correctly recognized as a real +# instruction rather than some unimplemented opcode for the purposes of +# encoding an instruction. + +# CHECK-LABEL: a: +# CHECK: nop # encoding: [0x00,0x00,0x00,0x00] +# CHECK: jrc $ra # encoding: [0x45,0xbf] +--- +name: a +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: false +registers: +liveins: + - { reg: '%a0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: +body: | + bb.0.entry: + renamable %zero = SLL_MMR6 killed renamable %zero, 0 + JRC16_MM undef %ra, implicit %v0 + +... diff --git a/test/CodeGen/PowerPC/cmp_elimination.ll b/test/CodeGen/PowerPC/cmp_elimination.ll index 3251ae2881b9..6bc8b8a041c2 100644 --- a/test/CodeGen/PowerPC/cmp_elimination.ll +++ b/test/CodeGen/PowerPC/cmp_elimination.ll @@ -1,4 +1,3 @@ -; XFAIL: * ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s @@ -748,6 +747,37 @@ do.end: ret void } +define void @func29(i32 signext %a) { +; We cannot merge two compares due to difference in sign extension behaviors. +; equivalent C code example: +; int a = .. ; +; if (a == -1) dummy1(); +; if (a == (uint16_t)-1) dummy2(); + +; CHECK-LABEL: @func29 +; CHECK: cmp +; CHECK: cmp +; CHECK: blr +entry: + %cmp = icmp eq i32 %a, -1 + br i1 %cmp, label %if.then, label %if.else + +if.then: + tail call void @dummy1() + br label %if.end3 + +if.else: + %cmp1 = icmp eq i32 %a, 65535 + br i1 %cmp1, label %if.then2, label %if.end3 + +if.then2: + tail call void @dummy2() + br label %if.end3 + +if.end3: + ret void +} + declare void @dummy1() declare void @dummy2() declare void @dummy3() diff --git a/test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll b/test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll new file mode 100644 index 000000000000..ad8dd90ea920 --- /dev/null +++ b/test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll @@ -0,0 +1,15 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr9 \ +; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s + +; Ensure we don't crash by trying to convert directly from a subword load +; to a ppc_fp128 as we do for conversions to f32/f64. +define ppc_fp128 @test(i16* nocapture readonly %Ptr) { +entry: + %0 = load i16, i16* %Ptr, align 2 + %conv = uitofp i16 %0 to ppc_fp128 + ret ppc_fp128 %conv +; CHECK: lhz [[LD:[0-9]+]], 0(3) +; CHECK: mtvsrwa [[MV:[0-9]+]], [[LD]] +; CHECK: xscvsxddp [[CONV:[0-9]+]], [[MV]] +; CHECK: bl __gcc_qadd +} diff --git a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll index 82c6c318abdc..247961e85b12 100644 --- a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll +++ b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll @@ -25,7 +25,7 @@ entry: ; CHECK: extsw 3, [[RSHREG]] ; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29 ; CHECK-P7-DAG: stxvw4x 34, -; CHECK-P7: lwax 3, [[ELEMOFFREG]], +; CHECK-P7: lwax 3, 3, [[ELEMOFFREG]] ; CHECK-BE-DAG: andi. [[ANDREG:[0-9]+]], 5, 2 ; CHECK-BE-DAG: sldi [[SLREG:[0-9]+]], [[ANDREG]], 2 ; CHECK-BE-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SLREG]] @@ -54,7 +54,7 @@ entry: ; CHECK: mfvsrd 3, ; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 3, 28, 28 ; CHECK-P7-DAG: stxvd2x 34, -; CHECK-P7: ldx 3, [[ELEMOFFREG]], +; CHECK-P7: ldx 3, 3, [[ELEMOFFREG]] ; CHECK-BE-DAG: andi. [[ANDREG:[0-9]+]], 5, 1 ; CHECK-BE-DAG: sldi [[SLREG:[0-9]+]], [[ANDREG]], 3 ; CHECK-BE-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SLREG]] @@ -77,7 +77,7 @@ entry: ; CHECK: xscvspdpn 1, ; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29 ; CHECK-P7-DAG: stxvw4x 34, -; CHECK-P7: lfsx 1, [[ELEMOFFREG]], +; CHECK-P7: lfsx 1, 3, [[ELEMOFFREG]] ; CHECK-BE: sldi [[ELNOREG:[0-9]+]], 5, 2 ; CHECK-BE: lvsl [[SHMSKREG:[0-9]+]], 0, [[ELNOREG]] ; CHECK-BE: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]] diff --git a/test/CodeGen/Thumb2/t2sizereduction.mir b/test/CodeGen/Thumb2/t2sizereduction.mir new file mode 100644 index 000000000000..377c0ccc7b0a --- /dev/null +++ b/test/CodeGen/Thumb2/t2sizereduction.mir @@ -0,0 +1,83 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=t2-reduce-size %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8m.main-arm-none-eabi" + + ; Function Attrs: norecurse nounwind readnone + define i32 @test(i32 %x, i32 %y) local_unnamed_addr #0 { + entry: + %cmp6 = icmp sgt i32 %y, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 1, %entry ], [ %mul, %for.body ] + ret i32 %sum.0.lcssa + + for.body: ; preds = %for.body, %for.body.preheader + %lsr.iv1 = phi i32 [ %lsr.iv.next2, %for.body ], [ %x, %for.body.preheader ] + %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %y, %for.body.preheader ] + %sum.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ] + %mul = mul nsw i32 %lsr.iv1, %sum.07 + %lsr.iv.next = add i32 %lsr.iv, -1 + %lsr.iv.next2 = add i32 %lsr.iv1, 1 + %exitcond = icmp eq i32 %lsr.iv.next, 0 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + } + + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="+d16,+dsp,+fp-armv8,+fp-only-sp,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" } + +... +--- +name: test +tracksRegLiveness: true +liveins: + - { reg: '%r0', virtual-reg: '' } + - { reg: '%r1', virtual-reg: '' } +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: liveins: %r0, %r1 + ; CHECK: %r2 = tMOVr %r0, 14, %noreg + ; CHECK: %r0, dead %cpsr = tMOVi8 1, 14, %noreg + ; CHECK: tCMPi8 %r1, 1, 14, %noreg, implicit-def %cpsr + ; CHECK: t2Bcc %bb.2, 11, killed %cpsr + ; CHECK: bb.1.for.body: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: %r0, %r1, %r2 + ; CHECK: %r0, dead %cpsr = tMUL %r2, killed %r0, 14, %noreg + ; CHECK: %r2, dead %cpsr = tADDi8 killed %r2, 1, 14, %noreg + ; CHECK: %r1, %cpsr = tSUBi8 killed %r1, 1, 14, %noreg + ; CHECK: t2Bcc %bb.1, 1, killed %cpsr + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: liveins: %r0 + ; CHECK: tBX_RET 14, %noreg, implicit %r0 + bb.0.entry: + successors: %bb.1.for.body, %bb.2.for.cond.cleanup + liveins: %r0, %r1 + + %r2 = tMOVr %r0, 14, _ + %r0 = t2MOVi 1, 14, _, _ + t2CMPri %r1, 1, 14, _, implicit-def %cpsr + t2Bcc %bb.2.for.cond.cleanup, 11, killed %cpsr + + bb.1.for.body: + successors: %bb.2.for.cond.cleanup, %bb.1.for.body + liveins: %r0, %r1, %r2 + + %r0 = t2MUL %r2, killed %r0, 14, _ + %r2 = t2ADDri killed %r2, 1, 14, _, _ + %r1 = t2SUBri killed %r1, 1, 14, _, def %cpsr + t2Bcc %bb.1.for.body, 1, killed %cpsr + + bb.2.for.cond.cleanup: + liveins: %r0 + + tBX_RET 14, _, implicit %r0 + +... diff --git a/test/CodeGen/X86/avg-mask.ll b/test/CodeGen/X86/avg-mask.ll index 578d7aa75287..d32b0e70791a 100644 --- a/test/CodeGen/X86/avg-mask.ll +++ b/test/CodeGen/X86/avg-mask.ll @@ -143,16 +143,8 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 ; AVX512F-NEXT: shrq $32, %rax ; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: movl %edi, (%rsp) -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm7 -; AVX512F-NEXT: vpavgb %xmm7, %xmm6, %xmm6 -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512F-NEXT: vpavgb %xmm7, %xmm8, %xmm7 -; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1 -; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} @@ -201,16 +193,8 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin ; AVX512F-NEXT: shrq $32, %rax ; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: movl %edi, (%rsp) -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4 -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5 -; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index d1e26b787f48..dd11f6ca2935 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -90,150 +90,23 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind { define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v32i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: movdqa 16(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: paddd %xmm6, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: paddd %xmm12, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: paddd %xmm11, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: paddd %xmm10, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm8, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: psubd %xmm4, %xmm9 -; SSE2-NEXT: psubd %xmm4, %xmm2 -; SSE2-NEXT: psubd %xmm4, %xmm5 -; SSE2-NEXT: psubd %xmm4, %xmm0 -; SSE2-NEXT: psubd %xmm4, %xmm6 -; SSE2-NEXT: psubd %xmm4, %xmm3 -; SSE2-NEXT: psubd %xmm4, %xmm7 -; SSE2-NEXT: psubd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm9, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm5, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm6, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm7, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: pavgb (%rdi), %xmm1 +; SSE2-NEXT: pavgb 16(%rsi), %xmm0 ; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm0, %xmm7, %xmm7 -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm2 -; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpand %xmm0, %xmm6, %xmm3 -; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rsi), %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -265,444 +138,467 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind { ret void } -define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind { -; SSE2-LABEL: avg_v64i8: +define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { +; SSE2-LABEL: avg_v48i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm6 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: movdqa 32(%rdi), %xmm1 -; SSE2-NEXT: movdqa 48(%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm6 +; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rsi), %xmm12 ; SSE2-NEXT: movdqa 16(%rsi), %xmm13 -; SSE2-NEXT: movdqa 32(%rsi), %xmm11 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm6, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm2, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm10, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: paddd %xmm4, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: paddd %xmm12, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE2-NEXT: paddd %xmm6, %xmm5 -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm13, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: paddd %xmm14, %xmm12 -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: paddd %xmm15, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm13, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT: paddd %xmm8, %xmm15 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE2-NEXT: paddd %xmm2, %xmm13 -; SSE2-NEXT: movdqa %xmm11, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: paddd %xmm5, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: paddd %xmm7, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm11, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: paddd %xmm2, %xmm14 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE2-NEXT: paddd %xmm1, %xmm11 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa 48(%rsi), %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa 32(%rsi), %xmm0 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; SSE2-NEXT: movdqa %xmm4, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] +; SSE2-NEXT: movdqa %xmm5, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm6, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: movdqa %xmm12, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] ; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] -; SSE2-NEXT: paddd %xmm1, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: paddd %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: paddd %xmm1, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: paddd %xmm2, %xmm7 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: psubd %xmm0, %xmm10 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: psubd %xmm0, %xmm12 -; SSE2-NEXT: psubd %xmm0, %xmm4 -; SSE2-NEXT: psubd %xmm0, %xmm15 -; SSE2-NEXT: psubd %xmm0, %xmm13 -; SSE2-NEXT: psubd %xmm0, %xmm9 -; SSE2-NEXT: psubd %xmm0, %xmm6 -; SSE2-NEXT: psubd %xmm0, %xmm14 -; SSE2-NEXT: psubd %xmm0, %xmm11 -; SSE2-NEXT: psubd %xmm0, %xmm8 -; SSE2-NEXT: psubd %xmm0, %xmm3 -; SSE2-NEXT: psubd %xmm0, %xmm5 -; SSE2-NEXT: psubd %xmm0, %xmm7 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: packuswb %xmm1, %xmm10 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm1, %xmm2 -; SSE2-NEXT: packuswb %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE2-NEXT: paddd %xmm2, %xmm8 +; SSE2-NEXT: movdqa %xmm11, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm12, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; SSE2-NEXT: paddd %xmm10, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: movdqa %xmm13, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; SSE2-NEXT: movdqa %xmm4, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE2-NEXT: paddd %xmm15, %xmm10 +; SSE2-NEXT: movdqa %xmm2, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE2-NEXT: paddd %xmm5, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm13, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: paddd %xmm14, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3] +; SSE2-NEXT: paddd %xmm6, %xmm13 +; SSE2-NEXT: movdqa %xmm0, %xmm6 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; SSE2-NEXT: movdqa %xmm6, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] +; SSE2-NEXT: paddd %xmm15, %xmm14 +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: paddd %xmm2, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE2-NEXT: paddd %xmm11, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 +; SSE2-NEXT: psubd %xmm5, %xmm8 +; SSE2-NEXT: psubd %xmm5, %xmm3 +; SSE2-NEXT: psubd %xmm5, %xmm9 +; SSE2-NEXT: psubd %xmm5, %xmm12 +; SSE2-NEXT: psubd %xmm5, %xmm10 +; SSE2-NEXT: psubd %xmm5, %xmm4 +; SSE2-NEXT: psubd %xmm5, %xmm1 +; SSE2-NEXT: psubd %xmm5, %xmm13 +; SSE2-NEXT: psubd %xmm5, %xmm14 +; SSE2-NEXT: psubd %xmm5, %xmm6 +; SSE2-NEXT: psubd %xmm5, %xmm2 +; SSE2-NEXT: psubd %xmm5, %xmm0 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255] +; SSE2-NEXT: pand %xmm7, %xmm8 +; SSE2-NEXT: pand %xmm7, %xmm3 +; SSE2-NEXT: packuswb %xmm8, %xmm3 ; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm12, %xmm4 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: pand %xmm7, %xmm9 +; SSE2-NEXT: pand %xmm7, %xmm12 +; SSE2-NEXT: packuswb %xmm9, %xmm12 +; SSE2-NEXT: packuswb %xmm3, %xmm12 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm10 +; SSE2-NEXT: pand %xmm7, %xmm10 +; SSE2-NEXT: pand %xmm7, %xmm4 +; SSE2-NEXT: packuswb %xmm10, %xmm4 ; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: psrld $1, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm13 -; SSE2-NEXT: packuswb %xmm15, %xmm13 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: pand %xmm7, %xmm1 +; SSE2-NEXT: pand %xmm7, %xmm13 +; SSE2-NEXT: packuswb %xmm1, %xmm13 ; SSE2-NEXT: packuswb %xmm4, %xmm13 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm9, %xmm6 -; SSE2-NEXT: psrld $1, %xmm11 ; SSE2-NEXT: psrld $1, %xmm14 -; SSE2-NEXT: pand %xmm0, %xmm14 -; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: packuswb %xmm14, %xmm11 -; SSE2-NEXT: packuswb %xmm6, %xmm11 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: pand %xmm0, %xmm8 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: packuswb %xmm8, %xmm3 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm5, %xmm7 -; SSE2-NEXT: packuswb %xmm3, %xmm7 -; SSE2-NEXT: movdqu %xmm7, (%rax) -; SSE2-NEXT: movdqu %xmm11, (%rax) +; SSE2-NEXT: pand %xmm7, %xmm14 +; SSE2-NEXT: pand %xmm7, %xmm6 +; SSE2-NEXT: packuswb %xmm14, %xmm6 +; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: pand %xmm7, %xmm2 +; SSE2-NEXT: pand %xmm7, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm6, %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: movdqu %xmm13, (%rax) -; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm12, (%rax) ; SSE2-NEXT: retq ; -; AVX1-LABEL: avg_v64i8: +; AVX1-LABEL: avg_v48i8: ; AVX1: # %bb.0: -; AVX1-NEXT: subq $24, %rsp -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm13 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm12 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm4, %xmm15, %xmm11 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm10 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm8 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm9 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm3, %xmm4 # 16-byte Folded Reload -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm5, %xmm3 # 16-byte Folded Reload -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm5, %xmm2 # 16-byte Folded Reload -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm14 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm15 -; AVX1-NEXT: vmovdqa %xmm15, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpsubd %xmm0, %xmm13, %xmm13 -; AVX1-NEXT: vpsubd %xmm0, %xmm12, %xmm12 -; AVX1-NEXT: vpsubd %xmm0, %xmm11, %xmm11 -; AVX1-NEXT: vpsubd %xmm0, %xmm10, %xmm10 -; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm8 -; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm9 -; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm7 -; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa (%rdi), %ymm2 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX1-NEXT: vmovdqa (%rsi), %ymm1 +; AVX1-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; AVX1-NEXT: vmovdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm14, %xmm14 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm5, %xmm14, %xmm14 -; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm14, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm6 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm13, %xmm2 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6 -; AVX1-NEXT: vpsrld $1, %xmm12, %xmm7 -; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vpsrld $1, %xmm8, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6 -; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vmovdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vmovdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm9 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm8 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm2, %xmm11, %xmm11 +; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[3,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm7, %xmm12, %xmm12 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm5, %xmm13, %xmm13 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm15 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm4, %xmm14, %xmm14 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm6, %xmm10, %xmm6 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: vpcmpeqd %xmm7, %xmm7, %xmm7 +; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; AVX1-NEXT: vpsubd %xmm7, %xmm1, %xmm10 +; AVX1-NEXT: vpsubd %xmm7, %xmm9, %xmm9 +; AVX1-NEXT: vpsubd %xmm7, %xmm8, %xmm8 +; AVX1-NEXT: vpsubd %xmm7, %xmm11, %xmm11 +; AVX1-NEXT: vpsubd %xmm7, %xmm12, %xmm12 +; AVX1-NEXT: vpsubd %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm7, %xmm13, %xmm4 +; AVX1-NEXT: vpsubd %xmm7, %xmm15, %xmm5 +; AVX1-NEXT: vpsubd %xmm7, %xmm14, %xmm1 +; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpsubd %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; AVX1-NEXT: vpsrld $1, %xmm2, %xmm14 +; AVX1-NEXT: vpsrld $1, %xmm6, %xmm15 +; AVX1-NEXT: vpsrld $1, %xmm1, %xmm13 +; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5 ; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm9, %xmm6 -; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 +; AVX1-NEXT: vpsrld $1, %xmm12, %xmm12 +; AVX1-NEXT: vpsrld $1, %xmm11, %xmm11 +; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7 +; AVX1-NEXT: vpsrld $1, %xmm9, %xmm2 +; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm7[0],xmm1[0] +; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm3 +; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm4 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm2 +; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm3 ; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vmovdqu %xmm0, (%rax) ; AVX1-NEXT: vmovups %ymm1, (%rax) -; AVX1-NEXT: addq $24, %rsp ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: avg_v64i8: +; AVX2-LABEL: avg_v48i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7 -; AVX2-NEXT: vpcmpeqd %ymm8, %ymm8, %ymm8 -; AVX2-NEXT: vpsubd %ymm8, %ymm0, %ymm9 -; AVX2-NEXT: vpsubd %ymm8, %ymm1, %ymm10 -; AVX2-NEXT: vpsubd %ymm8, %ymm2, %ymm2 -; AVX2-NEXT: vpsubd %ymm8, %ymm3, %ymm3 -; AVX2-NEXT: vpsubd %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vpsubd %ymm8, %ymm5, %ymm5 -; AVX2-NEXT: vpsubd %ymm8, %ymm6, %ymm1 -; AVX2-NEXT: vpsubd %ymm8, %ymm7, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm11 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm12 +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX2-NEXT: vpand %ymm9, %ymm5, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[3,1,2,3] +; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm9, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm4, %ymm7, %ymm4 +; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 +; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 +; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 +; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm6 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm7 -; AVX2-NEXT: vpsrld $1, %ymm10, %ymm8 -; AVX2-NEXT: vpsrld $1, %ymm9, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm0 -; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX2-NEXT: vpshufb %ymm2, %ymm12, %ymm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT: vmovdqu %xmm0, (%rax) ; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: avg_v48i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512F-NEXT: vpavgb %xmm5, %xmm4, %xmm4 +; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqu %xmm1, (%rax) +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: avg_v48i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512BW-NEXT: vpaddd %zmm4, %zmm2, %zmm2 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero +; AVX512BW-NEXT: vpaddd %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsubd %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsubd %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrld $1, %zmm3, %zmm1 +; AVX512BW-NEXT: vpsrld $1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqu %ymm1, (%rax) +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %1 = load <48 x i8>, <48 x i8>* %a + %2 = load <48 x i8>, <48 x i8>* %b + %3 = zext <48 x i8> %1 to <48 x i32> + %4 = zext <48 x i8> %2 to <48 x i32> + %5 = add nuw nsw <48 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %6 = add nuw nsw <48 x i32> %5, %4 + %7 = lshr <48 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %8 = trunc <48 x i32> %7 to <48 x i8> + store <48 x i8> %8, <48 x i8>* undef, align 4 + ret void +} + +define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind { +; SSE2-LABEL: avg_v64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa 32(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 +; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: pavgb (%rdi), %xmm1 +; SSE2-NEXT: pavgb 16(%rdi), %xmm2 +; SSE2-NEXT: pavgb 32(%rsi), %xmm0 +; SSE2-NEXT: pavgb 48(%rdi), %xmm3 +; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: retq +; +; AVX1-LABEL: avg_v64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vmovdqa (%rsi), %ymm2 +; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpavgb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovups %ymm1, (%rax) +; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-NEXT: vpavgb (%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3 -; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 -; AVX512F-NEXT: vpsubd %zmm4, %zmm0, %zmm0 -; AVX512F-NEXT: vpsubd %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vpsubd %zmm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpsubd %zmm4, %zmm3, %zmm3 -; AVX512F-NEXT: vpsrld $1, %zmm3, %zmm3 -; AVX512F-NEXT: vpsrld $1, %zmm2, %zmm2 -; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1 -; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm2, %xmm1 -; AVX512F-NEXT: vpmovdb %zmm3, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT: vpavgb (%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vpavgb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -782,81 +678,23 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind { define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v16i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm2 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 -; SSE2-NEXT: movdqa (%rsi), %xmm0 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: psubd %xmm4, %xmm3 -; SSE2-NEXT: psubd %xmm4, %xmm0 -; SSE2-NEXT: psubd %xmm4, %xmm2 -; SSE2-NEXT: psubd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm3, %xmm0 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: pavgw (%rdi), %xmm1 +; SSE2-NEXT: pavgw 16(%rsi), %xmm0 ; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rsi), %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpavgw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -891,207 +729,60 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind { define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v32i16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 -; SSE2-NEXT: movdqa 32(%rdi), %xmm10 -; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm9 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm8, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: paddd %xmm4, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: paddd %xmm5, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: paddd %xmm12, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: paddd %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm7 -; SSE2-NEXT: psubd %xmm0, %xmm9 -; SSE2-NEXT: psubd %xmm0, %xmm6 -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: psubd %xmm0, %xmm5 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: psubd %xmm0, %xmm4 -; SSE2-NEXT: psubd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm9 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: pslld $16, %xmm7 -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm9 -; SSE2-NEXT: psrad $16, %xmm9 -; SSE2-NEXT: packssdw %xmm7, %xmm9 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: pslld $16, %xmm4 -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: packssdw %xmm4, %xmm3 +; SSE2-NEXT: pavgw (%rdi), %xmm1 +; SSE2-NEXT: pavgw 16(%rdi), %xmm2 +; SSE2-NEXT: pavgw 32(%rsi), %xmm0 +; SSE2-NEXT: pavgw 48(%rdi), %xmm3 ; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm9, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7 -; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] -; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4],xmm0[5],xmm6[6],xmm0[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2],xmm0[3],xmm9[4],xmm0[5],xmm9[6],xmm0[7] -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vmovdqa (%rsi), %ymm2 +; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpavgw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT: vpavgw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpavgw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovups %ymm1, (%rax) +; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vpsubd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpsubd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpsubd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-NEXT: vpavgw (%rdi), %ymm1, %ymm1 +; AVX2-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpsubd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpsubd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1 -; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, (%rax) -; AVX512F-NEXT: vpmovdw %zmm1, (%rax) +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT: vpavgw (%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vpavgw 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1199,150 +890,23 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind { define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v32i8_2: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: movdqa 16(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm8, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm7, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: paddd %xmm6, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: paddd %xmm12, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: paddd %xmm11, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: paddd %xmm10, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm8, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: psubd %xmm4, %xmm9 -; SSE2-NEXT: psubd %xmm4, %xmm2 -; SSE2-NEXT: psubd %xmm4, %xmm5 -; SSE2-NEXT: psubd %xmm4, %xmm0 -; SSE2-NEXT: psubd %xmm4, %xmm6 -; SSE2-NEXT: psubd %xmm4, %xmm3 -; SSE2-NEXT: psubd %xmm4, %xmm7 -; SSE2-NEXT: psubd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm9 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm9, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm5, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm6, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm7, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: pavgb (%rsi), %xmm0 +; SSE2-NEXT: pavgb 16(%rdi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i8_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm0, %xmm7, %xmm7 -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm2 -; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpand %xmm0, %xmm6, %xmm3 -; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rsi), %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1377,249 +941,31 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind { define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v64i8_2: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rsi), %xmm14 -; SSE2-NEXT: movdqa 16(%rsi), %xmm12 +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm14, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm12, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm12, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm5, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd %xmm1, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm4, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm3, %xmm3 -; SSE2-NEXT: paddd %xmm2, %xmm2 -; SSE2-NEXT: paddd %xmm10, %xmm10 -; SSE2-NEXT: paddd %xmm5, %xmm5 -; SSE2-NEXT: paddd %xmm11, %xmm11 -; SSE2-NEXT: paddd %xmm12, %xmm12 -; SSE2-NEXT: paddd %xmm9, %xmm9 -; SSE2-NEXT: paddd %xmm6, %xmm6 -; SSE2-NEXT: paddd %xmm13, %xmm13 -; SSE2-NEXT: paddd %xmm14, %xmm14 -; SSE2-NEXT: paddd %xmm8, %xmm8 -; SSE2-NEXT: paddd %xmm7, %xmm7 -; SSE2-NEXT: paddd %xmm15, %xmm15 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm15 -; SSE2-NEXT: psubd %xmm0, %xmm7 -; SSE2-NEXT: psubd %xmm0, %xmm8 -; SSE2-NEXT: psubd %xmm0, %xmm14 -; SSE2-NEXT: psubd %xmm0, %xmm13 -; SSE2-NEXT: psubd %xmm0, %xmm6 -; SSE2-NEXT: psubd %xmm0, %xmm9 -; SSE2-NEXT: psubd %xmm0, %xmm12 -; SSE2-NEXT: psubd %xmm0, %xmm11 -; SSE2-NEXT: psubd %xmm0, %xmm5 -; SSE2-NEXT: psubd %xmm0, %xmm10 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: psubd %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: psubd %xmm0, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: psubd %xmm0, %xmm3 -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm15 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm15, %xmm7 -; SSE2-NEXT: psrld $1, %xmm14 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: pand %xmm0, %xmm8 -; SSE2-NEXT: pand %xmm0, %xmm14 -; SSE2-NEXT: packuswb %xmm8, %xmm14 -; SSE2-NEXT: packuswb %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm0, %xmm13 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm13, %xmm6 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: psrld $1, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: packuswb %xmm9, %xmm12 -; SSE2-NEXT: packuswb %xmm6, %xmm12 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: packuswb %xmm11, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm10, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm5, %xmm4 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqa 48(%rsi), %xmm3 +; SSE2-NEXT: pavgb %xmm0, %xmm0 +; SSE2-NEXT: pavgb %xmm1, %xmm1 +; SSE2-NEXT: pavgb %xmm2, %xmm2 +; SSE2-NEXT: pavgb %xmm3, %xmm3 +; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm12, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v64i8_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpaddd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpaddd %xmm5, %xmm5, %xmm6 -; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5 -; AVX1-NEXT: vpaddd %xmm3, %xmm3, %xmm4 -; AVX1-NEXT: vpaddd %xmm2, %xmm2, %xmm3 -; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm1 -; AVX1-NEXT: vpaddd %xmm15, %xmm15, %xmm15 -; AVX1-NEXT: vpaddd %xmm14, %xmm14, %xmm14 -; AVX1-NEXT: vpaddd %xmm13, %xmm13, %xmm13 -; AVX1-NEXT: vpaddd %xmm12, %xmm12, %xmm12 -; AVX1-NEXT: vpaddd %xmm11, %xmm11, %xmm11 -; AVX1-NEXT: vpaddd %xmm10, %xmm10, %xmm10 -; AVX1-NEXT: vpaddd %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm7 -; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8 -; AVX1-NEXT: vpsubd %xmm0, %xmm10, %xmm10 -; AVX1-NEXT: vpsubd %xmm0, %xmm11, %xmm9 -; AVX1-NEXT: vpsubd %xmm0, %xmm12, %xmm7 -; AVX1-NEXT: vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpsubd %xmm0, %xmm13, %xmm11 -; AVX1-NEXT: vpsubd %xmm0, %xmm14, %xmm13 -; AVX1-NEXT: vpsubd %xmm0, %xmm15, %xmm12 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm15 -; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm2 -; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm14 -; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm5 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpsrld $1, %xmm8, %xmm6 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm8 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vpsrld $1, %xmm9, %xmm6 -; AVX1-NEXT: vpsrld $1, %xmm10, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm11, %xmm6 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm3 -; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm12, %xmm6 -; AVX1-NEXT: vpsrld $1, %xmm13, %xmm0 -; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpackuswb %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vpsrld $1, %xmm15, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm14, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovdqa (%rsi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpavgb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovups %ymm1, (%rax) ; AVX1-NEXT: vmovups %ymm0, (%rax) @@ -1628,71 +974,10 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind { ; ; AVX2-LABEL: avg_v64i8_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm7, %ymm7, %ymm7 -; AVX2-NEXT: vpaddd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpaddd %ymm5, %ymm5, %ymm5 -; AVX2-NEXT: vpaddd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm3, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm8, %ymm8, %ymm8 -; AVX2-NEXT: vpsubd %ymm8, %ymm0, %ymm9 -; AVX2-NEXT: vpsubd %ymm8, %ymm1, %ymm10 -; AVX2-NEXT: vpsubd %ymm8, %ymm2, %ymm2 -; AVX2-NEXT: vpsubd %ymm8, %ymm3, %ymm3 -; AVX2-NEXT: vpsubd %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vpsubd %ymm8, %ymm5, %ymm5 -; AVX2-NEXT: vpsubd %ymm8, %ymm6, %ymm1 -; AVX2-NEXT: vpsubd %ymm8, %ymm7, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm11 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm12 -; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 -; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm6 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm7 -; AVX2-NEXT: vpsrld $1, %ymm10, %ymm8 -; AVX2-NEXT: vpsrld $1, %ymm9, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm0 -; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm8 -; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm8, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX2-NEXT: vpshufb %ymm2, %ymm12, %ymm4 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa (%rsi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-NEXT: vpavgb %ymm0, %ymm0, %ymm0 +; AVX2-NEXT: vpavgb %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -1700,29 +985,10 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind { ; ; AVX512F-LABEL: avg_v64i8_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT: vpaddd %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpaddd %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 -; AVX512F-NEXT: vpsubd %zmm4, %zmm0, %zmm0 -; AVX512F-NEXT: vpsubd %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vpsubd %zmm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpsubd %zmm4, %zmm3, %zmm3 -; AVX512F-NEXT: vpsrld $1, %zmm3, %zmm3 -; AVX512F-NEXT: vpsrld $1, %zmm2, %zmm2 -; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1 -; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm2, %xmm1 -; AVX512F-NEXT: vpmovdb %zmm3, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-NEXT: vpavgb %ymm0, %ymm0, %ymm0 +; AVX512F-NEXT: vpavgb %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper @@ -1805,81 +1071,23 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind { define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v16i16_2: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm2 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 -; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT: psubd %xmm4, %xmm3 -; SSE2-NEXT: psubd %xmm4, %xmm0 -; SSE2-NEXT: psubd %xmm4, %xmm2 -; SSE2-NEXT: psubd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm3, %xmm0 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: pavgw 16(%rdi), %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v16i16_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsubd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rsi), %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpavgw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1914,187 +1122,47 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind { define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; SSE2-LABEL: avg_v32i16_2: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 -; SSE2-NEXT: movdqa 32(%rdi), %xmm10 -; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm9 -; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa 48(%rsi), %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm8, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: paddd %xmm4, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: paddd %xmm5, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: paddd %xmm12, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: paddd %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: psubd %xmm0, %xmm7 -; SSE2-NEXT: psubd %xmm0, %xmm9 -; SSE2-NEXT: psubd %xmm0, %xmm6 -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: psubd %xmm0, %xmm5 -; SSE2-NEXT: psubd %xmm0, %xmm2 -; SSE2-NEXT: psubd %xmm0, %xmm4 -; SSE2-NEXT: psubd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm9 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: pslld $16, %xmm7 -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm9 -; SSE2-NEXT: psrad $16, %xmm9 -; SSE2-NEXT: packssdw %xmm7, %xmm9 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: pslld $16, %xmm4 -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: packssdw %xmm4, %xmm3 -; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 +; SSE2-NEXT: movdqa 32(%rsi), %xmm3 +; SSE2-NEXT: pavgw (%rsi), %xmm0 +; SSE2-NEXT: pavgw 16(%rsi), %xmm1 +; SSE2-NEXT: pavgw 32(%rdi), %xmm3 +; SSE2-NEXT: pavgw 48(%rsi), %xmm2 ; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm9, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i16_2: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm9 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpsubd %xmm0, %xmm9, %xmm8 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm4 -; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm6 -; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm9 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm8, %xmm7 -; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX1-NEXT: vpackusdw %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] -; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4],xmm0[5],xmm6[6],xmm0[7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2],xmm0[3],xmm9[4],xmm0[5],xmm9[6],xmm0[7] -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovups %ymm0, (%rax) +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vmovdqa (%rsi), %ymm2 +; AVX1-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpavgw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpavgw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpavgw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovups %ymm1, (%rax) +; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT: vpsubd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpsubd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpsubd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vmovdqu %ymm0, (%rax) ; AVX2-NEXT: vzeroupper @@ -2102,19 +1170,12 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind { ; ; AVX512F-LABEL: avg_v32i16_2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vpsubd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpsubd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1 -; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, (%rax) -; AVX512F-NEXT: vpmovdw %zmm1, (%rax) +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgw 32(%rdi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2209,89 +1270,21 @@ define void @avg_v16i8_const(<16 x i8>* %a) nounwind { define void @avg_v32i8_const(<32 x i8>* %a) nounwind { ; SSE2-LABEL: avg_v32i8_const: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4] -; SSE2-NEXT: paddd %xmm9, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8] -; SSE2-NEXT: paddd %xmm4, %xmm8 -; SSE2-NEXT: paddd %xmm9, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: packuswb %xmm7, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: packuswb %xmm6, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm1 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: packuswb %xmm8, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pavgb %xmm0, %xmm1 +; SSE2-NEXT: pavgb 16(%rdi), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i8_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4] -; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8] -; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2 -; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2 -; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3 -; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-NEXT: vpavgb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -2324,211 +1317,33 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind { define void @avg_v64i8_const(<64 x i8>* %a) nounwind { ; SSE2-LABEL: avg_v64i8_const: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm5 -; SSE2-NEXT: movdqa 16(%rdi), %xmm6 -; SSE2-NEXT: movdqa 32(%rdi), %xmm15 -; SSE2-NEXT: movdqa 48(%rdi), %xmm11 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm14, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm15, %xmm12 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [5,6,7,8] -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,2,3,4] -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm5, %xmm7 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: packuswb %xmm7, %xmm1 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm6, %xmm4 -; SSE2-NEXT: psrld $1, %xmm8 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm8 -; SSE2-NEXT: packuswb %xmm3, %xmm8 -; SSE2-NEXT: packuswb %xmm4, %xmm8 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: psrld $1, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: packuswb %xmm15, %xmm12 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: psrld $1, %xmm14 -; SSE2-NEXT: pand %xmm0, %xmm14 -; SSE2-NEXT: pand %xmm0, %xmm13 -; SSE2-NEXT: packuswb %xmm14, %xmm13 -; SSE2-NEXT: packuswb %xmm12, %xmm13 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm11, %xmm2 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: packuswb %xmm3, %xmm10 -; SSE2-NEXT: packuswb %xmm2, %xmm10 -; SSE2-NEXT: movdqu %xmm10, (%rax) -; SSE2-NEXT: movdqu %xmm13, (%rax) -; SSE2-NEXT: movdqu %xmm8, (%rax) +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pavgb %xmm0, %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: pavgb %xmm0, %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm3 +; SSE2-NEXT: pavgb %xmm0, %xmm3 +; SSE2-NEXT: pavgb 48(%rdi), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v64i8_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [5,6,7,8] -; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm15 -; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm13 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm7 -; AVX1-NEXT: vpaddd %xmm0, %xmm11, %xmm11 -; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm1 -; AVX1-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vpaddd %xmm0, %xmm9, %xmm9 -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,3,4] -; AVX1-NEXT: vpaddd %xmm2, %xmm12, %xmm0 -; AVX1-NEXT: vpaddd %xmm2, %xmm10, %xmm10 -; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm8 -; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm1 -; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm6 -; AVX1-NEXT: vpaddd -{{[0-9]+}}(%rsp), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm14 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm10, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm15, %xmm2 -; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm8, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm13, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm7, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm11, %xmm2 -; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm12, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm14, %xmm3 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-NEXT: vpavgb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpavgb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovups %ymm1, (%rax) ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -2536,82 +1351,21 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind { ; ; AVX2-LABEL: avg_v64i8_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [1,2,3,4,5,6,7,8] -; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7 -; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 -; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 -; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm8 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 -; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6 -; AVX2-NEXT: vpsrld $1, %ymm7, %ymm7 -; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm0 -; AVX2-NEXT: vpackssdw %xmm0, %xmm7, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT: vpackssdw %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vpackuswb %xmm0, %xmm6, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4 -; AVX2-NEXT: vpackuswb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm8, %xmm3 -; AVX2-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, (%rax) +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] +; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm1 +; AVX2-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v64i8_const: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8] -; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3 -; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 -; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1 -; AVX512F-NEXT: vpsrld $1, %zmm2, %zmm2 -; AVX512F-NEXT: vpsrld $1, %zmm3, %zmm3 -; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] +; AVX512F-NEXT: vpavgb (%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vpavgb 32(%rdi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqu %ymm0, (%rax) -; AVX512F-NEXT: vmovdqu %ymm2, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2680,57 +1434,21 @@ define void @avg_v8i16_const(<8 x i16>* %a) nounwind { define void @avg_v16i16_const(<16 x i16>* %a) nounwind { ; SSE2-LABEL: avg_v16i16_const: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm3 -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8] -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2,3,4] -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pavgw %xmm0, %xmm1 +; SSE2-NEXT: pavgw 16(%rdi), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v16i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,3,4] -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [5,6,7,8] -; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2 -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpavgw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -2763,148 +1481,57 @@ define void @avg_v16i16_const(<16 x i16>* %a) nounwind { define void @avg_v32i16_const(<32 x i16>* %a) nounwind { ; SSE2-LABEL: avg_v32i16_const: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm7 -; SSE2-NEXT: movdqa 16(%rdi), %xmm6 -; SSE2-NEXT: movdqa 32(%rdi), %xmm4 -; SSE2-NEXT: movdqa 48(%rdi), %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [5,6,7,8] -; SSE2-NEXT: paddd %xmm8, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4] -; SSE2-NEXT: paddd %xmm9, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm6 -; SSE2-NEXT: paddd %xmm9, %xmm3 -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: paddd %xmm9, %xmm2 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: pslld $16, %xmm7 -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: packssdw %xmm7, %xmm5 -; SSE2-NEXT: pslld $16, %xmm6 -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: packssdw %xmm6, %xmm3 -; SSE2-NEXT: pslld $16, %xmm4 -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm4, %xmm2 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm0, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pavgw %xmm0, %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: pavgw %xmm0, %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm3 +; SSE2-NEXT: pavgw %xmm0, %xmm3 +; SSE2-NEXT: pavgw 48(%rdi), %xmm0 +; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: movdqu %xmm3, (%rax) -; SSE2-NEXT: movdqu %xmm5, (%rax) +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i16_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4] -; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8] -; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2 -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm4, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm5, %xmm3 -; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3 -; AVX1-NEXT: vpsrld $1, %xmm9, %xmm4 -; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT: vpavgw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpavgw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpavgw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vmovups %ymm1, (%rax) ; AVX1-NEXT: vmovups %ymm0, (%rax) -; AVX1-NEXT: vmovups %ymm2, (%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,3,4,5,6,7,8] -; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX2-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1 +; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vmovdqu %ymm2, (%rax) +; AVX2-NEXT: vmovdqu %ymm1, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v32i16_const: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8] -; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT: vpsrld $1, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovdw %zmm1, (%rax) -; AVX512F-NEXT: vpmovdw %zmm0, (%rax) +; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512F-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512F-NEXT: vpavgw (%rdi), %ymm0, %ymm1 +; AVX512F-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -2946,77 +1573,16 @@ define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind { define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind { ; SSE2-LABEL: avg_v32i8_3: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; SSE2-NEXT: paddw %xmm6, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE2-NEXT: paddw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE2-NEXT: paddw %xmm7, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE2-NEXT: paddw %xmm3, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: psubw %xmm3, %xmm4 -; SSE2-NEXT: psubw %xmm3, %xmm0 -; SSE2-NEXT: psubw %xmm3, %xmm2 -; SSE2-NEXT: psubw %xmm3, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: pavgb %xmm2, %xmm0 +; SSE2-NEXT: pavgb %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v32i8_3: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT: vpaddw %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT: vpaddw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpaddw %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsubw %xmm1, %xmm4, %xmm4 -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm1 -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpavgb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -3041,203 +1607,36 @@ define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind { define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind { ; SSE2-LABEL: avg_v64i8_3: ; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE2-NEXT: movdqa %xmm1, %xmm11 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE2-NEXT: movdqa %xmm2, %xmm12 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE2-NEXT: movdqa %xmm3, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; SSE2-NEXT: paddw %xmm10, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE2-NEXT: paddw %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSE2-NEXT: paddw %xmm11, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSE2-NEXT: paddw %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; SSE2-NEXT: paddw %xmm12, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE2-NEXT: paddw %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm7, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] -; SSE2-NEXT: paddw %xmm13, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; SSE2-NEXT: paddw %xmm7, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE2-NEXT: psubw %xmm7, %xmm8 -; SSE2-NEXT: psubw %xmm7, %xmm0 -; SSE2-NEXT: psubw %xmm7, %xmm4 -; SSE2-NEXT: psubw %xmm7, %xmm1 -; SSE2-NEXT: psubw %xmm7, %xmm5 -; SSE2-NEXT: psubw %xmm7, %xmm2 -; SSE2-NEXT: psubw %xmm7, %xmm6 -; SSE2-NEXT: psubw %xmm7, %xmm3 -; SSE2-NEXT: psrlw $1, %xmm3 -; SSE2-NEXT: psrlw $1, %xmm6 -; SSE2-NEXT: psrlw $1, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm5 -; SSE2-NEXT: psrlw $1, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm4 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: pand %xmm7, %xmm8 -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: packuswb %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm7, %xmm5 -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: pand %xmm7, %xmm6 -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pavgb %xmm4, %xmm0 +; SSE2-NEXT: pavgb %xmm5, %xmm1 +; SSE2-NEXT: pavgb %xmm6, %xmm2 +; SSE2-NEXT: pavgb %xmm7, %xmm3 ; SSE2-NEXT: retq ; ; AVX1-LABEL: avg_v64i8_3: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm11 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpaddw %xmm7, %xmm5, %xmm12 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpaddw %xmm1, %xmm4, %xmm13 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpaddw %xmm4, %xmm6, %xmm14 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm15 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpaddw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpaddw %xmm2, %xmm11, %xmm2 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpaddw %xmm7, %xmm9, %xmm7 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpaddw %xmm3, %xmm10, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpsubw %xmm5, %xmm12, %xmm8 -; AVX1-NEXT: vpsubw %xmm5, %xmm13, %xmm4 -; AVX1-NEXT: vpsubw %xmm5, %xmm14, %xmm0 -; AVX1-NEXT: vpsubw %xmm5, %xmm15, %xmm1 -; AVX1-NEXT: vpsubw %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpsubw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpsubw %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vpsubw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm9 -; AVX1-NEXT: vpsrlw $1, %xmm7, %xmm5 -; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw $1, %xmm6, %xmm6 -; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlw $1, %xmm8, %xmm7 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm7 -; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm7[0],xmm4[0] -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpavgb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpavgb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm1 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpavgb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpavgb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: avg_v64i8_3: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero -; AVX2-NEXT: vpaddw %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX2-NEXT: vpaddw %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero -; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX2-NEXT: vpsubw %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpsubw %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsubw %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpsubw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $1, %ymm4, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpavgb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpavgb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: avg_v64i8_3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4 -; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5 -; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v64i8_3: diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll index 6e6d61f37d2e..248462d0de51 100644 --- a/test/CodeGen/X86/avx512-calling-conv.ll +++ b/test/CodeGen/X86/avx512-calling-conv.ll @@ -17,78 +17,40 @@ define <16 x i1> @test1() { } define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) { -; KNL-LABEL: test2: -; KNL: ## %bb.0: -; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vpslld $31, %zmm1, %zmm1 -; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: test2: -; SKX: ## %bb.0: -; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 -; SKX-NEXT: vpmovb2m %xmm1, %k0 -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 -; SKX-NEXT: vpmovb2m %xmm0, %k1 -; SKX-NEXT: kandw %k0, %k1, %k0 -; SKX-NEXT: vpmovm2b %k0, %xmm0 -; SKX-NEXT: retq +; ALL_X64-LABEL: test2: +; ALL_X64: ## %bb.0: +; ALL_X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL_X64-NEXT: vpsllw $7, %xmm0, %xmm0 +; ALL_X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; ALL_X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; ALL_X64-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test2: ; KNL_X32: ## %bb.0: -; KNL_X32-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL_X32-NEXT: vpslld $31, %zmm1, %zmm1 -; KNL_X32-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0 -; KNL_X32-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL_X32-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0 +; KNL_X32-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL_X32-NEXT: vpsllw $7, %xmm0, %xmm0 +; KNL_X32-NEXT: vpand LCPI1_0, %xmm0, %xmm0 +; KNL_X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; KNL_X32-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; KNL_X32-NEXT: retl %c = and <16 x i1>%a, %b ret <16 x i1> %c } define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { -; KNL-LABEL: test3: -; KNL: ## %bb.0: -; KNL-NEXT: vpmovsxwq %xmm1, %zmm1 -; KNL-NEXT: vpsllq $63, %zmm1, %zmm1 -; KNL-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0 -; KNL-NEXT: retq -; -; SKX-LABEL: test3: -; SKX: ## %bb.0: -; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 -; SKX-NEXT: vpmovw2m %xmm1, %k0 -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k1 -; SKX-NEXT: kandb %k0, %k1, %k0 -; SKX-NEXT: vpmovm2w %k0, %xmm0 -; SKX-NEXT: retq +; ALL_X64-LABEL: test3: +; ALL_X64: ## %bb.0: +; ALL_X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL_X64-NEXT: vpsllw $15, %xmm0, %xmm0 +; ALL_X64-NEXT: vpsraw $15, %xmm0, %xmm0 +; ALL_X64-NEXT: retq ; ; KNL_X32-LABEL: test3: ; KNL_X32: ## %bb.0: -; KNL_X32-NEXT: vpmovsxwq %xmm1, %zmm1 -; KNL_X32-NEXT: vpsllq $63, %zmm1, %zmm1 -; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0 -; KNL_X32-NEXT: vpsllq $63, %zmm0, %zmm0 -; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0 -; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0 +; KNL_X32-NEXT: vpand %xmm1, %xmm0, %xmm0 +; KNL_X32-NEXT: vpsllw $15, %xmm0, %xmm0 +; KNL_X32-NEXT: vpsraw $15, %xmm0, %xmm0 ; KNL_X32-NEXT: retl %c = and <8 x i1>%a, %b ret <8 x i1> %c @@ -102,11 +64,9 @@ define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) { ; ; SKX-LABEL: test4: ; SKX: ## %bb.0: -; SKX-NEXT: vpslld $31, %xmm1, %xmm1 +; SKX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vptestmd %xmm1, %xmm1, %k0 {%k1} -; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: vpsrad $31, %xmm0, %xmm0 ; SKX-NEXT: retq ; ; KNL_X32-LABEL: test4: diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll index 97beff63811a..8c7941591217 100644 --- a/test/CodeGen/X86/avx512-ext.ll +++ b/test/CodeGen/X86/avx512-ext.ll @@ -1366,21 +1366,12 @@ define i16 @trunc_16i32_to_16i1(<16 x i32> %a) { } define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) { -; KNL-LABEL: trunc_4i32_to_4i1: -; KNL: # %bb.0: -; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; KNL-NEXT: vpslld $31, %xmm0, %xmm0 -; KNL-NEXT: vpsrad $31, %xmm0, %xmm0 -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_4i32_to_4i1: -; SKX: # %bb.0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vpslld $31, %xmm1, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} -; SKX-NEXT: vpmovm2d %k0, %xmm0 -; SKX-NEXT: retq +; ALL-LABEL: trunc_4i32_to_4i1: +; ALL: # %bb.0: +; ALL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpslld $31, %xmm0, %xmm0 +; ALL-NEXT: vpsrad $31, %xmm0, %xmm0 +; ALL-NEXT: retq %mask_a = trunc <4 x i32>%a to <4 x i1> %mask_b = trunc <4 x i32>%b to <4 x i1> %a_and_b = and <4 x i1>%mask_a, %mask_b diff --git a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll index 34ea468aebee..e1ed8ea98a1c 100644 --- a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll +++ b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll @@ -249,9 +249,9 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl ; AVX512: # %bb.0: ; AVX512-NEXT: kmovd (%rdi), %k0 ; AVX512-NEXT: kshiftrd $16, %k0, %k0 -; AVX512-NEXT: vpmovm2q %k0, %zmm2 -; AVX512-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512-NEXT: vpmovq2m %zmm2, %k1 +; AVX512-NEXT: vpmovm2d %k0, %ymm2 +; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512-NEXT: vpmovd2m %ymm2, %k1 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512-NEXT: vmovaps %ymm1, (%rsi) ; AVX512-NEXT: vzeroupper @@ -261,10 +261,11 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1 -; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2 -; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1 +; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} +; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper @@ -340,10 +341,10 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl ; AVX512: # %bb.0: ; AVX512-NEXT: kmovd (%rdi), %k0 ; AVX512-NEXT: kshiftrd $24, %k0, %k0 -; AVX512-NEXT: vpmovm2q %k0, %zmm2 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7] -; AVX512-NEXT: vpermq %zmm2, %zmm3, %zmm2 -; AVX512-NEXT: vpmovq2m %zmm2, %k1 +; AVX512-NEXT: vpmovm2d %k0, %ymm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512-NEXT: vpmovd2m %ymm2, %k1 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512-NEXT: vmovaps %ymm1, (%rsi) ; AVX512-NEXT: vzeroupper @@ -353,11 +354,12 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1 -; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7] -; AVX512NOTDQ-NEXT: vpermq %zmm2, %zmm3, %zmm2 -; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2 -; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1 +; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} +; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] +; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper @@ -433,9 +435,9 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl ; AVX512: # %bb.0: ; AVX512-NEXT: kmovq (%rdi), %k0 ; AVX512-NEXT: kshiftrq $32, %k0, %k0 -; AVX512-NEXT: vpmovm2q %k0, %zmm2 -; AVX512-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512-NEXT: vpmovq2m %zmm2, %k1 +; AVX512-NEXT: vpmovm2d %k0, %ymm2 +; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512-NEXT: vpmovd2m %ymm2, %k1 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512-NEXT: vmovaps %ymm1, (%rsi) ; AVX512-NEXT: vzeroupper @@ -445,10 +447,11 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1 -; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2 -; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1 +; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} +; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper @@ -555,10 +558,10 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl ; AVX512: # %bb.0: ; AVX512-NEXT: kmovq (%rdi), %k0 ; AVX512-NEXT: kshiftrq $56, %k0, %k0 -; AVX512-NEXT: vpmovm2q %k0, %zmm2 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7] -; AVX512-NEXT: vpermq %zmm2, %zmm3, %zmm2 -; AVX512-NEXT: vpmovq2m %zmm2, %k1 +; AVX512-NEXT: vpmovm2d %k0, %ymm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512-NEXT: vpmovd2m %ymm2, %k1 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512-NEXT: vmovaps %ymm1, (%rsi) ; AVX512-NEXT: vzeroupper @@ -568,11 +571,12 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1 -; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7] -; AVX512NOTDQ-NEXT: vpermq %zmm2, %zmm3, %zmm2 -; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2 -; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1 +; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} +; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] +; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512NOTDQ-NEXT: vpslld $31, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper @@ -1054,9 +1058,9 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovd (%rdi), %k0 ; AVX512-NEXT: kshiftrd $16, %k0, %k0 -; AVX512-NEXT: vpmovm2q %k0, %zmm0 -; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 -; AVX512-NEXT: vpmovq2m %zmm0, %k0 +; AVX512-NEXT: vpmovm2d %k0, %ymm0 +; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX512-NEXT: vpmovd2m %ymm0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1065,10 +1069,11 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1 -; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %zmm0 -; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper @@ -1159,10 +1164,10 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovd (%rdi), %k0 ; AVX512-NEXT: kshiftrd $24, %k0, %k0 -; AVX512-NEXT: vpmovm2q %k0, %zmm0 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7] -; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vpmovq2m %zmm0, %k0 +; AVX512-NEXT: vpmovm2d %k0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512-NEXT: vpmovd2m %ymm0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1171,11 +1176,12 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1 -; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7] -; AVX512NOTDQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] +; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper @@ -1266,9 +1272,9 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovq (%rdi), %k0 ; AVX512-NEXT: kshiftrq $32, %k0, %k0 -; AVX512-NEXT: vpmovm2q %k0, %zmm0 -; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0 -; AVX512-NEXT: vpmovq2m %zmm0, %k0 +; AVX512-NEXT: vpmovm2d %k0, %ymm0 +; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX512-NEXT: vpmovd2m %ymm0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1277,10 +1283,11 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1 -; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %zmm0 -; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper @@ -1399,10 +1406,10 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) { ; AVX512: # %bb.0: ; AVX512-NEXT: kmovq (%rdi), %k0 ; AVX512-NEXT: kshiftrq $56, %k0, %k0 -; AVX512-NEXT: vpmovm2q %k0, %zmm0 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7] -; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vpmovq2m %zmm0, %k0 +; AVX512-NEXT: vpmovm2d %k0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512-NEXT: vpmovd2m %ymm0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1411,11 +1418,12 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) { ; AVX512NOTDQ: # %bb.0: ; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0 ; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1 -; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7] -; AVX512NOTDQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] +; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512NOTDQ-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) ; AVX512NOTDQ-NEXT: vzeroupper diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index 7e0b981b2c6a..7477e05f0c7f 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -793,11 +793,10 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> ; KNL-NEXT: cmpl %esi, %edi ; KNL-NEXT: setb %al ; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k1 -; KNL-NEXT: movl {{.*}}(%rip), %ecx -; KNL-NEXT: vpbroadcastd %ecx, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k1 -; KNL-NEXT: vpbroadcastd %ecx, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 @@ -1432,8 +1431,7 @@ define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) { ; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi ; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: andl $15, %edi -; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; CHECK-NEXT: movb (%rdi,%rax), %al +; CHECK-NEXT: movb -24(%rsp,%rdi), %al ; CHECK-NEXT: retq %t2 = extractelement <16 x i8> %t1, i32 %index ret i8 %t2 @@ -1452,8 +1450,7 @@ define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) { ; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi ; CHECK-NEXT: vmovaps %ymm0, (%rsp) ; CHECK-NEXT: andl $31, %edi -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: movb (%rdi,%rax), %al +; CHECK-NEXT: movb (%rsp,%rdi), %al ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: vzeroupper @@ -1477,8 +1474,7 @@ define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) { ; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovaps %ymm0, (%rsp) ; KNL-NEXT: andl $63, %edi -; KNL-NEXT: movq %rsp, %rax -; KNL-NEXT: movb (%rdi,%rax), %al +; KNL-NEXT: movb (%rsp,%rdi), %al ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp ; KNL-NEXT: vzeroupper @@ -1496,8 +1492,7 @@ define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) { ; SKX-NEXT: ## kill: def %edi killed %edi def %rdi ; SKX-NEXT: vmovaps %zmm0, (%rsp) ; SKX-NEXT: andl $63, %edi -; SKX-NEXT: movq %rsp, %rax -; SKX-NEXT: movb (%rdi,%rax), %al +; SKX-NEXT: movb (%rsp,%rdi), %al ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp ; SKX-NEXT: vzeroupper @@ -1522,8 +1517,7 @@ define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) ; KNL-NEXT: vmovaps %ymm0, (%rsp) ; KNL-NEXT: movzbl %dil, %eax ; KNL-NEXT: andl $63, %eax -; KNL-NEXT: movq %rsp, %rcx -; KNL-NEXT: movb (%rax,%rcx), %al +; KNL-NEXT: movb (%rsp,%rax), %al ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp ; KNL-NEXT: vzeroupper @@ -1542,8 +1536,7 @@ define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) ; SKX-NEXT: vmovaps %zmm0, (%rsp) ; SKX-NEXT: movzbl %dil, %eax ; SKX-NEXT: andl $63, %eax -; SKX-NEXT: movq %rsp, %rcx -; SKX-NEXT: movb (%rax,%rcx), %al +; SKX-NEXT: movb (%rsp,%rax), %al ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp ; SKX-NEXT: vzeroupper @@ -1617,45 +1610,28 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) { ; KNL-LABEL: test_extractelement_varible_v8i1: ; KNL: ## %bb.0: -; KNL-NEXT: pushq %rbp -; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: .cfi_offset %rbp, -16 -; KNL-NEXT: movq %rsp, %rbp -; KNL-NEXT: .cfi_def_cfa_register %rbp -; KNL-NEXT: andq $-64, %rsp -; KNL-NEXT: subq $128, %rsp ; KNL-NEXT: ## kill: def %edi killed %edi def %rdi ; KNL-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1 ; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0 ; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 -; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vmovdqa64 %zmm0, (%rsp) +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdw %zmm0, %ymm0 +; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; KNL-NEXT: andl $7, %edi -; KNL-NEXT: movzbl (%rsp,%rdi,8), %eax +; KNL-NEXT: movzbl -24(%rsp,%rdi,2), %eax ; KNL-NEXT: andl $1, %eax -; KNL-NEXT: movq %rbp, %rsp -; KNL-NEXT: popq %rbp ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_varible_v8i1: ; SKX: ## %bb.0: -; SKX-NEXT: pushq %rbp -; SKX-NEXT: .cfi_def_cfa_offset 16 -; SKX-NEXT: .cfi_offset %rbp, -16 -; SKX-NEXT: movq %rsp, %rbp -; SKX-NEXT: .cfi_def_cfa_register %rbp -; SKX-NEXT: andq $-64, %rsp -; SKX-NEXT: subq $128, %rsp ; SKX-NEXT: ## kill: def %edi killed %edi def %rdi ; SKX-NEXT: vpcmpnleud %ymm1, %ymm0, %k0 -; SKX-NEXT: vpmovm2q %k0, %zmm0 -; SKX-NEXT: vmovdqa64 %zmm0, (%rsp) +; SKX-NEXT: vpmovm2w %k0, %xmm0 +; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; SKX-NEXT: andl $7, %edi -; SKX-NEXT: movzbl (%rsp,%rdi,8), %eax +; SKX-NEXT: movzbl -24(%rsp,%rdi,2), %eax ; SKX-NEXT: andl $1, %eax -; SKX-NEXT: movq %rbp, %rsp -; SKX-NEXT: popq %rbp ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t1 = icmp ugt <8 x i32> %a, %b @@ -1667,43 +1643,26 @@ define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) { ; KNL-LABEL: test_extractelement_varible_v16i1: ; KNL: ## %bb.0: -; KNL-NEXT: pushq %rbp -; KNL-NEXT: .cfi_def_cfa_offset 16 -; KNL-NEXT: .cfi_offset %rbp, -16 -; KNL-NEXT: movq %rsp, %rbp -; KNL-NEXT: .cfi_def_cfa_register %rbp -; KNL-NEXT: andq $-64, %rsp -; KNL-NEXT: subq $128, %rsp ; KNL-NEXT: ## kill: def %edi killed %edi def %rdi ; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vmovdqa32 %zmm0, (%rsp) +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; KNL-NEXT: andl $15, %edi -; KNL-NEXT: movzbl (%rsp,%rdi,4), %eax +; KNL-NEXT: movzbl -24(%rsp,%rdi), %eax ; KNL-NEXT: andl $1, %eax -; KNL-NEXT: movq %rbp, %rsp -; KNL-NEXT: popq %rbp ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq ; ; SKX-LABEL: test_extractelement_varible_v16i1: ; SKX: ## %bb.0: -; SKX-NEXT: pushq %rbp -; SKX-NEXT: .cfi_def_cfa_offset 16 -; SKX-NEXT: .cfi_offset %rbp, -16 -; SKX-NEXT: movq %rsp, %rbp -; SKX-NEXT: .cfi_def_cfa_register %rbp -; SKX-NEXT: andq $-64, %rsp -; SKX-NEXT: subq $128, %rsp ; SKX-NEXT: ## kill: def %edi killed %edi def %rdi ; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 -; SKX-NEXT: vpmovm2d %k0, %zmm0 -; SKX-NEXT: vmovdqa32 %zmm0, (%rsp) +; SKX-NEXT: vpmovm2b %k0, %xmm0 +; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ; SKX-NEXT: andl $15, %edi -; SKX-NEXT: movzbl (%rsp,%rdi,4), %eax +; SKX-NEXT: movzbl -24(%rsp,%rdi), %eax ; SKX-NEXT: andl $1, %eax -; SKX-NEXT: movq %rbp, %rsp -; SKX-NEXT: popq %rbp ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq %t1 = icmp ugt <16 x i32> %a, %b @@ -1729,8 +1688,7 @@ define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, ; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vmovdqa %ymm0, (%rsp) ; KNL-NEXT: andl $31, %edi -; KNL-NEXT: movq %rsp, %rax -; KNL-NEXT: movzbl (%rdi,%rax), %eax +; KNL-NEXT: movzbl (%rsp,%rdi), %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp @@ -1744,14 +1702,14 @@ define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, ; SKX-NEXT: .cfi_offset %rbp, -16 ; SKX-NEXT: movq %rsp, %rbp ; SKX-NEXT: .cfi_def_cfa_register %rbp -; SKX-NEXT: andq $-64, %rsp -; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: andq $-32, %rsp +; SKX-NEXT: subq $64, %rsp ; SKX-NEXT: ## kill: def %edi killed %edi def %rdi ; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 -; SKX-NEXT: vpmovm2w %k0, %zmm0 -; SKX-NEXT: vmovdqa32 %zmm0, (%rsp) +; SKX-NEXT: vpmovm2b %k0, %ymm0 +; SKX-NEXT: vmovdqa %ymm0, (%rsp) ; SKX-NEXT: andl $31, %edi -; SKX-NEXT: movzbl (%rsp,%rdi,2), %eax +; SKX-NEXT: movzbl (%rsp,%rdi), %eax ; SKX-NEXT: andl $1, %eax ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp @@ -1792,8 +1750,7 @@ define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) { ; KNL-NEXT: andl $31, %esi ; KNL-NEXT: testb %dil, %dil ; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) -; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; KNL-NEXT: setne (%rsi,%rax) +; KNL-NEXT: setne 32(%rsp,%rsi) ; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 @@ -1817,20 +1774,18 @@ define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) { ; SKX-NEXT: .cfi_offset %rbp, -16 ; SKX-NEXT: movq %rsp, %rbp ; SKX-NEXT: .cfi_def_cfa_register %rbp -; SKX-NEXT: andq $-64, %rsp -; SKX-NEXT: subq $128, %rsp +; SKX-NEXT: andq $-32, %rsp +; SKX-NEXT: subq $64, %rsp ; SKX-NEXT: ## kill: def %esi killed %esi def %rsi ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k1 -; SKX-NEXT: xorl %eax, %eax -; SKX-NEXT: testb %dil, %dil -; SKX-NEXT: setne %al -; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} -; SKX-NEXT: vmovdqa32 %zmm0, (%rsp) +; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 ; SKX-NEXT: andl $31, %esi -; SKX-NEXT: movw %ax, (%rsp,%rsi,2) -; SKX-NEXT: vpsllw $15, (%rsp), %zmm0 -; SKX-NEXT: vpmovw2m %zmm0, %k0 +; SKX-NEXT: testb %dil, %dil +; SKX-NEXT: vpmovm2b %k0, %ymm0 +; SKX-NEXT: vmovdqa %ymm0, (%rsp) +; SKX-NEXT: setne (%rsp,%rsi) +; SKX-NEXT: vpsllw $7, (%rsp), %ymm0 +; SKX-NEXT: vpmovb2m %ymm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp @@ -1863,8 +1818,7 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) { ; KNL-NEXT: testb %dil, %dil ; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) -; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; KNL-NEXT: setne (%rsi,%rax) +; KNL-NEXT: setne 64(%rsp,%rsi) ; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0 ; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2 @@ -1905,13 +1859,12 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) { ; SKX-NEXT: subq $128, %rsp ; SKX-NEXT: ## kill: def %esi killed %esi def %rsi ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 +; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 ; SKX-NEXT: andl $63, %esi ; SKX-NEXT: testb %dil, %dil -; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} +; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: vmovdqa32 %zmm0, (%rsp) -; SKX-NEXT: movq %rsp, %rax -; SKX-NEXT: setne (%rsi,%rax) +; SKX-NEXT: setne (%rsp,%rsi) ; SKX-NEXT: vpsllw $7, (%rsp), %zmm0 ; SKX-NEXT: vpmovb2m %zmm0, %k0 ; SKX-NEXT: kmovq %k0, %rax @@ -2050,8 +2003,7 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) { ; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp) -; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; KNL-NEXT: setne (%rax,%rcx) +; KNL-NEXT: setne 128(%rsp,%rax) ; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1 ; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2 ; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3 @@ -2215,18 +2167,16 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) { ; SKX-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2 ; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k1 -; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k2 +; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k0 +; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k1 ; SKX-NEXT: movl 744(%rbp), %eax ; SKX-NEXT: andl $127, %eax ; SKX-NEXT: cmpb $0, 736(%rbp) -; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SKX-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} {z} -; SKX-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp) -; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vpmovm2b %k1, %zmm0 +; SKX-NEXT: vmovdqa32 %zmm0, {{[0-9]+}}(%rsp) +; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: vmovdqa32 %zmm0, (%rsp) -; SKX-NEXT: movq %rsp, %rcx -; SKX-NEXT: setne (%rax,%rcx) +; SKX-NEXT: setne (%rsp,%rax) ; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0 ; SKX-NEXT: vpmovb2m %zmm0, %k0 ; SKX-NEXT: vpsllw $7, (%rsp), %zmm0 @@ -2270,8 +2220,7 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index ; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp) -; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; KNL-NEXT: setne (%rsi,%rax) +; KNL-NEXT: setne 128(%rsp,%rsi) ; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1 ; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2 ; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3 @@ -2336,17 +2285,15 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index ; SKX-NEXT: subq $256, %rsp ## imm = 0x100 ; SKX-NEXT: ## kill: def %esi killed %esi def %rsi ; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k1 -; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k2 +; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k0 +; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k1 ; SKX-NEXT: andl $127, %esi ; SKX-NEXT: testb %dil, %dil -; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SKX-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} {z} -; SKX-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp) -; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: vpmovm2b %k1, %zmm0 +; SKX-NEXT: vmovdqa32 %zmm0, {{[0-9]+}}(%rsp) +; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: vmovdqa32 %zmm0, (%rsp) -; SKX-NEXT: movq %rsp, %rax -; SKX-NEXT: setne (%rsi,%rax) +; SKX-NEXT: setne (%rsp,%rsi) ; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0 ; SKX-NEXT: vpmovb2m %zmm0, %k0 ; SKX-NEXT: vpsllw $7, (%rsp), %zmm0 diff --git a/test/CodeGen/X86/avx512-insert-extract_i1.ll b/test/CodeGen/X86/avx512-insert-extract_i1.ll index e28e384ae996..9283fd32d746 100644 --- a/test/CodeGen/X86/avx512-insert-extract_i1.ll +++ b/test/CodeGen/X86/avx512-insert-extract_i1.ll @@ -18,8 +18,7 @@ define zeroext i8 @test_extractelement_varible_v64i1(<64 x i8> %a, <64 x i8> %b, ; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: vmovdqa32 %zmm0, (%rsp) ; SKX-NEXT: andl $63, %edi -; SKX-NEXT: movq %rsp, %rax -; SKX-NEXT: movzbl (%rdi,%rax), %eax +; SKX-NEXT: movzbl (%rsp,%rdi), %eax ; SKX-NEXT: andl $1, %eax ; SKX-NEXT: movq %rbp, %rsp ; SKX-NEXT: popq %rbp diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index dfe42d53483f..4877157d911d 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1783,20 +1783,19 @@ define void @ktest_2(<32 x float> %in, float * %base) { ; KNL-NEXT: vmovups (%rdi), %zmm2 ; KNL-NEXT: vmovups 64(%rdi), %zmm3 ; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k1 -; KNL-NEXT: movl {{.*}}(%rip), %eax -; KNL-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm2, %xmm2 ; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k2 -; KNL-NEXT: vpbroadcastd %eax, %zmm3 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm3, %xmm3 ; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z} ; KNL-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z} ; KNL-NEXT: vcmpltps %zmm5, %zmm0, %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm5 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm5, %xmm5 ; KNL-NEXT: vpor %xmm5, %xmm2, %xmm2 ; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm4 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm4, %xmm4 ; KNL-NEXT: vpor %xmm4, %xmm3, %xmm3 ; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 @@ -1886,20 +1885,19 @@ define void @ktest_2(<32 x float> %in, float * %base) { ; AVX512DQ-NEXT: vmovups (%rdi), %zmm2 ; AVX512DQ-NEXT: vmovups 64(%rdi), %zmm3 ; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k1 -; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax -; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k2 -; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm3 {%k2} {z} +; AVX512DQ-NEXT: vpmovm2d %k2, %zmm3 ; AVX512DQ-NEXT: vpmovdb %zmm3, %xmm3 ; AVX512DQ-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z} ; AVX512DQ-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT: vcmpltps %zmm5, %zmm0, %k1 -; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm5 {%k1} {z} +; AVX512DQ-NEXT: vcmpltps %zmm5, %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm5 ; AVX512DQ-NEXT: vpmovdb %zmm5, %xmm5 ; AVX512DQ-NEXT: vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-NEXT: vcmpltps %zmm4, %zmm1, %k1 -; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm4 {%k1} {z} +; AVX512DQ-NEXT: vcmpltps %zmm4, %zmm1, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 ; AVX512DQ-NEXT: vpmovdb %zmm4, %xmm4 ; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3 diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll index 1b450b98a6d5..78111874b58a 100755 --- a/test/CodeGen/X86/avx512-schedule.ll +++ b/test/CodeGen/X86/avx512-schedule.ll @@ -4376,20 +4376,16 @@ define i16 @trunc_16i32_to_16i1(<16 x i32> %a) { define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) { ; GENERIC-LABEL: trunc_4i32_to_4i1: ; GENERIC: # %bb.0: +; GENERIC-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00] -; GENERIC-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} # sched: [1:1.00] -; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpsrad $31, %xmm0, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: trunc_4i32_to_4i1: ; SKX: # %bb.0: +; SKX-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33] ; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] -; SKX-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:0.50] -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} # sched: [3:1.00] -; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25] +; SKX-NEXT: vpsrad $31, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: retq # sched: [7:1.00] %mask_a = trunc <4 x i32>%a to <4 x i1> %mask_b = trunc <4 x i32>%b to <4 x i1> diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index df88f0fca456..0601c011e290 100644 --- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -1488,12 +1488,10 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7] -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> @@ -1503,13 +1501,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7] -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 -; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1 +; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> @@ -1522,13 +1518,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7] -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1 +; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> @@ -1863,14 +1857,12 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2 -; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[3,1,2,3,7,5,6,7] -; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm2 -; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6] -; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] -; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1 -; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [15,5,3,2,15,5,7,6] +; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -1884,14 +1876,12 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1 -; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[3,1,2,3,7,5,6,7] -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; CHECK-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,3,2,4,5,7,6] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1 -; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6] +; CHECK-NEXT: vpermi2d %ymm1, %ymm2, %ymm3 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp @@ -2298,13 +2288,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm0[3,1,2,3] -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,3,1] -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,3,1] +; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2315,13 +2304,12 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm0[3,1,2,3] -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,3,1] -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,3,3,1] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2332,12 +2320,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,1,0,6] +; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2348,13 +2335,12 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,1,0,6] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2405,13 +2391,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,0,3,3] -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4] +; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2422,13 +2407,12 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,0,3,3] -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2585,12 +2569,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,1,2,1] -; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,1,1,5] +; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5> @@ -2602,14 +2585,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,1] -; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1 -; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,1,1,5] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5> @@ -2669,12 +2651,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5] -; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5],ymm2[6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,4,6,1] +; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1> @@ -2686,14 +2667,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1 -; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,6,1] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1> @@ -2739,11 +2719,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,2] -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 +; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,2,3,2] +; CHECK-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2> @@ -2754,12 +2733,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,1,2,3] -; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,3,2] -; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,2,3,2] +; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2> @@ -2771,14 +2749,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3] -; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,2] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1 -; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,2,3,2] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2> @@ -3307,14 +3284,13 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %v define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = <0,4,u,u,6,1,4,4> -; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm3 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] +; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,10,11,6,1,4,4] +; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1 +; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -3325,14 +3301,13 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = <0,4,u,u,6,1,4,4> -; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm2 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = xmm2[0,0] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,10,11,6,1,4,4] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4> %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -3775,12 +3750,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %zmm2 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[2,3],ymm3[4,6],ymm2[6,7] -; CHECK-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7] -; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,10,6,15,4,14,6,15] +; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 +; CHECK-NEXT: vmovaps %xmm4, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -3795,12 +3769,11 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %zmm1 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,3],ymm2[4,6],ymm1[6,7] -; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 -; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,10,6,15,4,14,6,15] +; CHECK-NEXT: vpermi2ps %ymm1, %ymm2, %ymm3 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1 +; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -3815,12 +3788,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %zmm2 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[0,0],ymm2[6,4],ymm3[4,4] -; CHECK-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[2,3],ymm2[6,4],ymm3[6,7] -; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1 -; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [4,14,4,14,4,14,6,7] +; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 +; CHECK-NEXT: vmovaps %xmm4, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp @@ -3835,12 +3807,11 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %zmm1 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4] -; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,3],ymm1[6,4],ymm2[6,7] -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1 -; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [4,14,4,14,4,14,6,7] +; CHECK-NEXT: vpermi2ps %ymm1, %ymm2, %ymm3 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1 +; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x float>, <16 x float>* %vp diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll index 1182bbf94ec5..6bee0de181ab 100644 --- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll +++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll @@ -30,10 +30,9 @@ define <8 x i1> @test2(<2 x i1> %a) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 ; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0 -; CHECK-NEXT: vpmovm2q %k0, %zmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; CHECK-NEXT: vpmovq2m %zmm0, %k0 +; CHECK-NEXT: vpmovm2d %k0, %ymm0 +; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; CHECK-NEXT: vpmovd2m %ymm0, %k0 ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll index fc684e54b063..826a4538f3f1 100644 --- a/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/test/CodeGen/X86/avx512-vec-cmp.ll @@ -192,15 +192,14 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind { ; KNL-NEXT: andq $-32, %rsp ; KNL-NEXT: subq $32, %rsp ; KNL-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 -; KNL-NEXT: movl {{.*}}(%rip), %eax -; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 ; KNL-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 -; KNL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll index 9863a0a7d283..5f4b050b863d 100644 --- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -101,7 +101,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -147,7 +147,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -282,7 +282,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -333,7 +333,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -1354,7 +1354,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -1401,7 +1401,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -1543,7 +1543,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -1595,7 +1595,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -4799,7 +4799,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -4839,7 +4839,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -4882,7 +4882,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x ; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -4926,7 +4926,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -4970,7 +4970,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %_ ; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -5014,7 +5014,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u ; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -5061,7 +5061,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -5106,7 +5106,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64> ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -5154,7 +5154,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -5203,7 +5203,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -5252,7 +5252,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -5301,7 +5301,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -9211,7 +9211,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -9257,7 +9257,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -9392,7 +9392,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -9443,7 +9443,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -10464,7 +10464,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -10511,7 +10511,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -10653,7 +10653,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -10705,7 +10705,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -13909,7 +13909,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -13949,7 +13949,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -13992,7 +13992,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -14036,7 +14036,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -14080,7 +14080,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -14124,7 +14124,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -14171,7 +14171,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -14216,7 +14216,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -14264,7 +14264,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -14313,7 +14313,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -14362,7 +14362,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -14411,7 +14411,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -18328,7 +18328,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -18377,7 +18377,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -18519,7 +18519,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -18573,7 +18573,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -19641,7 +19641,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -19691,7 +19691,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -19840,7 +19840,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -19895,7 +19895,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -23147,7 +23147,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -23187,7 +23187,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -23230,7 +23230,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -23274,7 +23274,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -23318,7 +23318,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: vpcmpnltd (%rdi){1to16}, %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -23362,7 +23362,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpcmpnltd (%rsi){1to16}, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -23409,7 +23409,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -23454,7 +23454,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -23502,7 +23502,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -23551,7 +23551,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -23600,7 +23600,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -23649,7 +23649,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -27664,7 +27664,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -27713,7 +27713,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -27857,7 +27857,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -27911,7 +27911,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -28989,7 +28989,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -29039,7 +29039,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -29190,7 +29190,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -29245,7 +29245,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -32541,7 +32541,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -32581,7 +32581,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -32624,7 +32624,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -32668,7 +32668,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -32712,7 +32712,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -32756,7 +32756,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -32803,7 +32803,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -32848,7 +32848,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -32896,7 +32896,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -32945,7 +32945,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -32994,7 +32994,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* % ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -33043,7 +33043,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -39363,7 +39363,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -39403,7 +39403,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -39444,7 +39444,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* ; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -39488,7 +39488,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -39532,7 +39532,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -39577,7 +39577,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k1 {%k1} ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -39666,7 +39666,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -39711,7 +39711,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -39757,7 +39757,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -39806,7 +39806,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -39855,7 +39855,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u, ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -39905,7 +39905,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__ ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 @@ -43827,7 +43827,7 @@ define i32 @test_cmpm_rnd_zero(<16 x float> %a, <16 x float> %b) { ; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k1 ; NoVLX-NEXT: kxorw %k0, %k0, %k0 ; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; NoVLX-NEXT: vpmovdb %zmm0, %xmm0 ; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0 ; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0 diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll index 2276e5634537..78c44e4dca3b 100644 --- a/test/CodeGen/X86/bitcast-and-setcc-128.ll +++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -30,13 +30,13 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) { ; AVX512F-LABEL: v8i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper @@ -943,12 +943,12 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { ; AVX512F-NEXT: vpsllw $8, %xmm0, %xmm0 ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm0 -; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} +; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1 +; AVX512F-NEXT: vpmovsxwd %xmm2, %ymm0 +; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 {%k1} ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll index 7d0381837b70..8fdacb7b79d6 100644 --- a/test/CodeGen/X86/bitcast-setcc-128.ll +++ b/test/CodeGen/X86/bitcast-setcc-128.ll @@ -26,9 +26,9 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512F-LABEL: v8i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper @@ -640,9 +640,9 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) { ; AVX512F-NEXT: vpsllw $8, %xmm0, %xmm0 ; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512F-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: # kill: def %al killed %al killed %eax ; AVX512F-NEXT: vzeroupper diff --git a/test/CodeGen/X86/combine-and.ll b/test/CodeGen/X86/combine-and.ll index e92237f524f5..c2da74a1646c 100644 --- a/test/CodeGen/X86/combine-and.ll +++ b/test/CodeGen/X86/combine-and.ll @@ -220,6 +220,16 @@ define <4 x i32> @and_or_v4i32(<4 x i32> %a0) { ret <4 x i32> %2 } +define <8 x i16> @and_or_v8i16(<8 x i16> %a0) { +; CHECK-LABEL: and_or_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [15,7,3,1,14,10,2,32767] +; CHECK-NEXT: retq + %1 = or <8 x i16> %a0, <i16 255, i16 127, i16 63, i16 31, i16 15, i16 31, i16 63, i16 -1> + %2 = and <8 x i16> %1, <i16 15, i16 7, i16 3, i16 1, i16 14, i16 10, i16 2, i16 32767> + ret <8 x i16> %2 +} + ; ; known bits folding ; diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll index 1601c67dce25..65184c4d278e 100644 --- a/test/CodeGen/X86/combine-or.ll +++ b/test/CodeGen/X86/combine-or.ll @@ -430,25 +430,35 @@ define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) { ret <4 x i32> %or } -; TODO: Why would we do this? -; (or (and X, c1), c2) -> (and (or X, c2), c1|c2) +; (or (and X, c1), c2) -> (and (or X, c2), c1|c2) iff (c1 & c2) != 0 define <2 x i64> @or_and_v2i64(<2 x i64> %a0) { ; CHECK-LABEL: or_and_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 ; CHECK-NEXT: orps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq %1 = and <2 x i64> %a0, <i64 7, i64 7> %2 = or <2 x i64> %1, <i64 3, i64 3> ret <2 x i64> %2 } -; If all masked bits are going to be set, that's a constant fold. - define <4 x i32> @or_and_v4i32(<4 x i32> %a0) { ; CHECK-LABEL: or_and_v4i32: ; CHECK: # %bb.0: +; CHECK-NEXT: orps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: retq + %1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7> + %2 = or <4 x i32> %1, <i32 3, i32 2, i32 15, i32 2> + ret <4 x i32> %2 +} + +; If all masked bits are going to be set, that's a constant fold. + +define <4 x i32> @or_and_v4i32_fold(<4 x i32> %a0) { +; CHECK-LABEL: or_and_v4i32_fold: +; CHECK: # %bb.0: ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [3,3,3,3] ; CHECK-NEXT: retq %1 = and <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1> diff --git a/test/CodeGen/X86/darwin-bzero.ll b/test/CodeGen/X86/darwin-bzero.ll index 3099526028ab..410d67ff0ec1 100644 --- a/test/CodeGen/X86/darwin-bzero.ll +++ b/test/CodeGen/X86/darwin-bzero.ll @@ -1,7 +1,10 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin10 | grep __bzero +; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind +; CHECK-LABEL: foo: +; CHECK: {{calll|callq}} ___bzero define void @foo(i8* %p, i32 %len) { call void @llvm.memset.p0i8.i32(i8* %p, i8 0, i32 %len, i32 1, i1 false) ret void diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll index 4d24a15fe2e1..66bdfb8475f1 100644 --- a/test/CodeGen/X86/extractelement-index.ll +++ b/test/CodeGen/X86/extractelement-index.ll @@ -403,16 +403,14 @@ define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: andl $15, %edi ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movb (%rdi,%rax), %al +; SSE-NEXT: movb -24(%rsp,%rdi), %al ; SSE-NEXT: retq ; ; AVX-LABEL: extractelement_v16i8_var: ; AVX: # %bb.0: ; AVX-NEXT: andl $15, %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movb (%rdi,%rax), %al +; AVX-NEXT: movb -24(%rsp,%rdi), %al ; AVX-NEXT: retq %b = extractelement <16 x i8> %a, i256 %i ret i8 %b @@ -428,8 +426,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind { ; SSE-NEXT: andl $31, %edi ; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; SSE-NEXT: movaps %xmm0, (%rsp) -; SSE-NEXT: movq %rsp, %rax -; SSE-NEXT: movb (%rdi,%rax), %al +; SSE-NEXT: movb (%rsp,%rdi), %al ; SSE-NEXT: movq %rbp, %rsp ; SSE-NEXT: popq %rbp ; SSE-NEXT: retq @@ -442,8 +439,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind { ; AVX-NEXT: subq $64, %rsp ; AVX-NEXT: andl $31, %edi ; AVX-NEXT: vmovaps %ymm0, (%rsp) -; AVX-NEXT: movq %rsp, %rax -; AVX-NEXT: movb (%rdi,%rax), %al +; AVX-NEXT: movb (%rsp,%rdi), %al ; AVX-NEXT: movq %rbp, %rsp ; AVX-NEXT: popq %rbp ; AVX-NEXT: vzeroupper diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll index 8dacf2dcf971..a0e919d128df 100644 --- a/test/CodeGen/X86/fma-fneg-combine.ll +++ b/test/CodeGen/X86/fma-fneg-combine.ll @@ -140,21 +140,23 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x doubl define <4 x float> @test11(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { ; SKX-LABEL: test11: ; SKX: # %bb.0: # %entry -; SKX-NEXT: vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm0 +; SKX-NEXT: vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm2 ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1} +; SKX-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1} +; SKX-NEXT: vmovaps %xmm2, %xmm0 ; SKX-NEXT: retq ; ; KNL-LABEL: test11: ; KNL: # %bb.0: # %entry -; KNL-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0,-0,-0,-0] -; KNL-NEXT: vxorps %xmm0, %xmm2, %xmm0 +; KNL-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0] +; KNL-NEXT: vxorps %xmm3, %xmm2, %xmm2 ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1} +; KNL-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1} +; KNL-NEXT: vmovaps %xmm2, %xmm0 ; KNL-NEXT: retq entry: %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c - %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10 + %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10 ret <4 x float> %0 } @@ -164,19 +166,17 @@ define <4 x float> @test11b(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 z ; SKX-LABEL: test11b: ; SKX: # %bb.0: # %entry ; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm1 {%k1} -; SKX-NEXT: vmovaps %xmm1, %xmm0 +; SKX-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} ; SKX-NEXT: retq ; ; KNL-LABEL: test11b: ; KNL: # %bb.0: # %entry ; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm1 {%k1} -; KNL-NEXT: vmovaps %xmm1, %xmm0 +; KNL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} ; KNL-NEXT: retq entry: %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c - %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10 + %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10 ret <4 x float> %0 } @@ -305,3 +305,147 @@ define <8 x double> @test17(<8 x double> %a, <8 x double> %b, <8 x double> %c, i ret <8 x double> %res } declare <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) + +define <4 x float> @test18(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test18: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: retq +; +; KNL-LABEL: test18: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: retq +entry: + %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b + %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 4) #10 + ret <4 x float> %0 +} + +define <4 x float> @test19(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test19: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: retq +; +; KNL-LABEL: test19: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: retq +entry: + %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b + %sub.i.2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c + %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %sub.i.2, i8 %mask, i32 4) #10 + ret <4 x float> %0 +} + +define <4 x float> @test20(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test20: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1} +; SKX-NEXT: vmovaps %xmm2, %xmm0 +; SKX-NEXT: retq +; +; KNL-LABEL: test20: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1} +; KNL-NEXT: vmovaps %xmm2, %xmm0 +; KNL-NEXT: retq +entry: + %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b + %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 4) #10 + ret <4 x float> %0 +} + +define <4 x float> @test21(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test21: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: retq +; +; KNL-LABEL: test21: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: retq +entry: + %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b + %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 8) #10 + ret <4 x float> %0 +} + +define <4 x float> @test22(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test22: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: retq +; +; KNL-LABEL: test22: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: retq +entry: + %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b + %sub.i.2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c + %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %sub.i.2, i8 %mask, i32 8) #10 + ret <4 x float> %0 +} + +define <4 x float> @test23(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test23: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; SKX-NEXT: vmovaps %xmm2, %xmm0 +; SKX-NEXT: retq +; +; KNL-LABEL: test23: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; KNL-NEXT: vmovaps %xmm2, %xmm0 +; KNL-NEXT: retq +entry: + %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b + %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 8) #10 + ret <4 x float> %0 +} + +define <4 x float> @test24(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 { +; SKX-LABEL: test24: +; SKX: # %bb.0: # %entry +; SKX-NEXT: kmovd %edi, %k1 +; SKX-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; SKX-NEXT: retq +; +; KNL-LABEL: test24: +; KNL: # %bb.0: # %entry +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; KNL-NEXT: retq +entry: + %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c + %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 8) #10 + ret <4 x float> %0 +} + +define <16 x float> @test25(<16 x float> %a, <16 x float> %b, <16 x float> %c) { +; CHECK-LABEL: test25: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: retq +entry: + %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b + %sub.i.2 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c + %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a, <16 x float> %sub.i, <16 x float> %sub.i.2, i16 -1, i32 8) #2 + ret <16 x float> %0 +} diff --git a/test/CodeGen/X86/fmsubadd-combine.ll b/test/CodeGen/X86/fmsubadd-combine.ll index 814d61e22382..ca2c61a88507 100644 --- a/test/CodeGen/X86/fmsubadd-combine.ll +++ b/test/CodeGen/X86/fmsubadd-combine.ll @@ -8,26 +8,17 @@ define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 { ; FMA3_256-LABEL: mul_subadd_pd128: ; FMA3_256: # %bb.0: # %entry -; FMA3_256-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; FMA3_256-NEXT: vsubpd %xmm2, %xmm0, %xmm1 -; FMA3_256-NEXT: vaddpd %xmm2, %xmm0, %xmm0 -; FMA3_256-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; FMA3_256-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 ; FMA3_256-NEXT: retq ; ; FMA3_512-LABEL: mul_subadd_pd128: ; FMA3_512: # %bb.0: # %entry -; FMA3_512-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; FMA3_512-NEXT: vsubpd %xmm2, %xmm0, %xmm1 -; FMA3_512-NEXT: vaddpd %xmm2, %xmm0, %xmm0 -; FMA3_512-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; FMA3_512-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 ; FMA3_512-NEXT: retq ; ; FMA4-LABEL: mul_subadd_pd128: ; FMA4: # %bb.0: # %entry -; FMA4-NEXT: vmulpd %xmm1, %xmm0, %xmm0 -; FMA4-NEXT: vsubpd %xmm2, %xmm0, %xmm1 -; FMA4-NEXT: vaddpd %xmm2, %xmm0, %xmm0 -; FMA4-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 ; FMA4-NEXT: retq entry: %AB = fmul <2 x double> %A, %B @@ -40,18 +31,12 @@ entry: define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 { ; FMA3-LABEL: mul_subadd_ps128: ; FMA3: # %bb.0: # %entry -; FMA3-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA3-NEXT: vsubps %xmm2, %xmm0, %xmm1 -; FMA3-NEXT: vaddps %xmm2, %xmm0, %xmm0 -; FMA3-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; FMA3-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 ; FMA3-NEXT: retq ; ; FMA4-LABEL: mul_subadd_ps128: ; FMA4: # %bb.0: # %entry -; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; FMA4-NEXT: vsubps %xmm2, %xmm0, %xmm1 -; FMA4-NEXT: vaddps %xmm2, %xmm0, %xmm0 -; FMA4-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 ; FMA4-NEXT: retq entry: %AB = fmul <4 x float> %A, %B @@ -64,18 +49,12 @@ entry: define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 { ; FMA3-LABEL: mul_subadd_pd256: ; FMA3: # %bb.0: # %entry -; FMA3-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; FMA3-NEXT: vsubpd %ymm2, %ymm0, %ymm1 -; FMA3-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; FMA3-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; FMA3-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 ; FMA3-NEXT: retq ; ; FMA4-LABEL: mul_subadd_pd256: ; FMA4: # %bb.0: # %entry -; FMA4-NEXT: vmulpd %ymm1, %ymm0, %ymm0 -; FMA4-NEXT: vsubpd %ymm2, %ymm0, %ymm1 -; FMA4-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 ; FMA4-NEXT: retq entry: %AB = fmul <4 x double> %A, %B @@ -88,18 +67,12 @@ entry: define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 { ; FMA3-LABEL: mul_subadd_ps256: ; FMA3: # %bb.0: # %entry -; FMA3-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; FMA3-NEXT: vsubps %ymm2, %ymm0, %ymm1 -; FMA3-NEXT: vaddps %ymm2, %ymm0, %ymm0 -; FMA3-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; FMA3-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 ; FMA3-NEXT: retq ; ; FMA4-LABEL: mul_subadd_ps256: ; FMA4: # %bb.0: # %entry -; FMA4-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; FMA4-NEXT: vsubps %ymm2, %ymm0, %ymm1 -; FMA4-NEXT: vaddps %ymm2, %ymm0, %ymm0 -; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 ; FMA4-NEXT: retq entry: %AB = fmul <8 x float> %A, %B @@ -112,34 +85,19 @@ entry: define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 { ; FMA3_256-LABEL: mul_subadd_pd512: ; FMA3_256: # %bb.0: # %entry -; FMA3_256-NEXT: vmulpd %ymm2, %ymm0, %ymm0 -; FMA3_256-NEXT: vmulpd %ymm3, %ymm1, %ymm1 -; FMA3_256-NEXT: vsubpd %ymm5, %ymm1, %ymm2 -; FMA3_256-NEXT: vsubpd %ymm4, %ymm0, %ymm3 -; FMA3_256-NEXT: vaddpd %ymm5, %ymm1, %ymm1 -; FMA3_256-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] -; FMA3_256-NEXT: vaddpd %ymm4, %ymm0, %ymm0 -; FMA3_256-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] +; FMA3_256-NEXT: vfmsubadd213pd %ymm4, %ymm2, %ymm0 +; FMA3_256-NEXT: vfmsubadd213pd %ymm5, %ymm3, %ymm1 ; FMA3_256-NEXT: retq ; ; FMA3_512-LABEL: mul_subadd_pd512: ; FMA3_512: # %bb.0: # %entry -; FMA3_512-NEXT: vmulpd %zmm1, %zmm0, %zmm0 -; FMA3_512-NEXT: vsubpd %zmm2, %zmm0, %zmm1 -; FMA3_512-NEXT: vaddpd %zmm2, %zmm0, %zmm0 -; FMA3_512-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[7] +; FMA3_512-NEXT: vfmsubadd213pd %zmm2, %zmm1, %zmm0 ; FMA3_512-NEXT: retq ; ; FMA4-LABEL: mul_subadd_pd512: ; FMA4: # %bb.0: # %entry -; FMA4-NEXT: vmulpd %ymm2, %ymm0, %ymm0 -; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1 -; FMA4-NEXT: vsubpd %ymm5, %ymm1, %ymm2 -; FMA4-NEXT: vsubpd %ymm4, %ymm0, %ymm3 -; FMA4-NEXT: vaddpd %ymm5, %ymm1, %ymm1 -; FMA4-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] -; FMA4-NEXT: vaddpd %ymm4, %ymm0, %ymm0 -; FMA4-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3] +; FMA4-NEXT: vfmsubaddpd %ymm4, %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vfmsubaddpd %ymm5, %ymm3, %ymm1, %ymm1 ; FMA4-NEXT: retq entry: %AB = fmul <8 x double> %A, %B @@ -152,35 +110,19 @@ entry: define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 { ; FMA3_256-LABEL: mul_subadd_ps512: ; FMA3_256: # %bb.0: # %entry -; FMA3_256-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA3_256-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA3_256-NEXT: vsubps %ymm5, %ymm1, %ymm2 -; FMA3_256-NEXT: vsubps %ymm4, %ymm0, %ymm3 -; FMA3_256-NEXT: vaddps %ymm5, %ymm1, %ymm1 -; FMA3_256-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; FMA3_256-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; FMA3_256-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] +; FMA3_256-NEXT: vfmsubadd213ps %ymm4, %ymm2, %ymm0 +; FMA3_256-NEXT: vfmsubadd213ps %ymm5, %ymm3, %ymm1 ; FMA3_256-NEXT: retq ; ; FMA3_512-LABEL: mul_subadd_ps512: ; FMA3_512: # %bb.0: # %entry -; FMA3_512-NEXT: vmulps %zmm1, %zmm0, %zmm1 -; FMA3_512-NEXT: vaddps %zmm2, %zmm1, %zmm0 -; FMA3_512-NEXT: movw $-21846, %ax # imm = 0xAAAA -; FMA3_512-NEXT: kmovw %eax, %k1 -; FMA3_512-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} +; FMA3_512-NEXT: vfmsubadd213ps %zmm2, %zmm1, %zmm0 ; FMA3_512-NEXT: retq ; ; FMA4-LABEL: mul_subadd_ps512: ; FMA4: # %bb.0: # %entry -; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; FMA4-NEXT: vsubps %ymm5, %ymm1, %ymm2 -; FMA4-NEXT: vsubps %ymm4, %ymm0, %ymm3 -; FMA4-NEXT: vaddps %ymm5, %ymm1, %ymm1 -; FMA4-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; FMA4-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; FMA4-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] +; FMA4-NEXT: vfmsubaddps %ymm4, %ymm2, %ymm0, %ymm0 +; FMA4-NEXT: vfmsubaddps %ymm5, %ymm3, %ymm1, %ymm1 ; FMA4-NEXT: retq entry: %AB = fmul <16 x float> %A, %B diff --git a/test/CodeGen/X86/fold-vector-sext-crash.ll b/test/CodeGen/X86/fold-vector-sext-crash.ll index 481f55e9e10d..db73195698e3 100644 --- a/test/CodeGen/X86/fold-vector-sext-crash.ll +++ b/test/CodeGen/X86/fold-vector-sext-crash.ll @@ -9,9 +9,9 @@ define <4 x i64> @foo(<4 x i64> %A) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vmovdqa %xmm1, %xmm1 -; CHECK-NEXT: vandps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295,0,0,0,0] +; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 ; CHECK-NEXT: retl %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i64> %A, <4 x i64><i64 undef, i64 undef, i64 0, i64 0> ret <4 x i64> %1 diff --git a/test/CodeGen/X86/horizontal-reduce-smax.ll b/test/CodeGen/X86/horizontal-reduce-smax.ll index a54e01d9af67..fa92158ae92d 100644 --- a/test/CodeGen/X86/horizontal-reduce-smax.ll +++ b/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -309,30 +309,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; ; X86-SSE42-LABEL: test_reduce_v16i8: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: test_reduce_v16i8: ; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX-NEXT: retl @@ -371,30 +366,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v16i8: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq ; ; X64-AVX-LABEL: test_reduce_v16i8: ; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX-NEXT: retq @@ -906,16 +896,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE42-LABEL: test_reduce_v32i8: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -924,14 +911,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -940,15 +925,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: vzeroupper @@ -994,16 +977,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-SSE42-LABEL: test_reduce_v32i8: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -1012,14 +992,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -1028,15 +1006,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -1045,15 +1021,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX512-NEXT: vzeroupper @@ -1743,16 +1717,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1 ; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0 ; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -1764,14 +1735,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -1781,15 +1750,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: vzeroupper @@ -1847,16 +1814,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1 ; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0 ; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -1868,14 +1832,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -1885,15 +1847,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -1902,17 +1862,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX512-NEXT: vzeroupper diff --git a/test/CodeGen/X86/horizontal-reduce-smin.ll b/test/CodeGen/X86/horizontal-reduce-smin.ll index f03e745598e6..fa5828a45700 100644 --- a/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -311,30 +311,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; ; X86-SSE42-LABEL: test_reduce_v16i8: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: test_reduce_v16i8: ; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX-NEXT: retl @@ -373,30 +368,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v16i8: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq ; ; X64-AVX-LABEL: test_reduce_v16i8: ; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX-NEXT: retq @@ -910,16 +900,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE42-LABEL: test_reduce_v32i8: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -928,14 +915,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -944,15 +929,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: vzeroupper @@ -998,16 +981,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-SSE42-LABEL: test_reduce_v32i8: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -1016,14 +996,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -1032,15 +1010,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -1049,15 +1025,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX512-NEXT: vzeroupper @@ -1745,16 +1719,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE42-NEXT: pminsb %xmm3, %xmm1 ; X86-SSE42-NEXT: pminsb %xmm2, %xmm0 ; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -1766,14 +1737,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -1783,15 +1752,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: vzeroupper @@ -1849,16 +1816,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-SSE42-NEXT: pminsb %xmm3, %xmm1 ; X64-SSE42-NEXT: pminsb %xmm2, %xmm0 ; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -1870,14 +1834,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -1887,15 +1849,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -1904,17 +1864,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX512-NEXT: vzeroupper diff --git a/test/CodeGen/X86/horizontal-reduce-umax.ll b/test/CodeGen/X86/horizontal-reduce-umax.ll index 52e623b82718..204479976e90 100644 --- a/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -362,30 +362,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; ; X86-SSE42-LABEL: test_reduce_v16i8: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: test_reduce_v16i8: ; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX-NEXT: retl @@ -408,30 +403,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v16i8: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq ; ; X64-AVX-LABEL: test_reduce_v16i8: ; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX-NEXT: retq @@ -1031,16 +1021,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE42-LABEL: test_reduce_v32i8: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -1049,14 +1036,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -1065,15 +1050,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: vzeroupper @@ -1099,16 +1082,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-SSE42-LABEL: test_reduce_v32i8: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -1117,14 +1097,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -1133,15 +1111,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -1150,15 +1126,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX512-NEXT: vzeroupper @@ -1992,16 +1966,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE42-NEXT: pmaxub %xmm3, %xmm1 ; X86-SSE42-NEXT: pmaxub %xmm2, %xmm0 ; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 -; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT: psrlw $8, %xmm2 +; X86-SSE42-NEXT: pminub %xmm0, %xmm2 +; X86-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -2013,14 +1984,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -2030,15 +1999,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: vzeroupper @@ -2068,16 +2035,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-SSE42-NEXT: pmaxub %xmm3, %xmm1 ; X64-SSE42-NEXT: pmaxub %xmm2, %xmm0 ; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 -; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT: psrlw $8, %xmm2 +; X64-SSE42-NEXT: pminub %xmm0, %xmm2 +; X64-SSE42-NEXT: phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -2089,14 +2053,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -2106,15 +2068,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -2123,17 +2083,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX512-NEXT: vzeroupper diff --git a/test/CodeGen/X86/horizontal-reduce-umin.ll b/test/CodeGen/X86/horizontal-reduce-umin.ll index 505663656a3a..2a37d17365be 100644 --- a/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -352,30 +352,19 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; ; X86-SSE42-LABEL: test_reduce_v16i8: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pminub %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pminub %xmm1, %xmm0 ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: psrlw $8, %xmm1 ; X86-SSE42-NEXT: pminub %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: test_reduce_v16i8: ; X86-AVX: ## %bb.0: -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX-NEXT: retl @@ -398,30 +387,19 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v16i8: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pminub %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pminub %xmm1, %xmm0 ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: psrlw $8, %xmm1 ; X64-SSE42-NEXT: pminub %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq ; ; X64-AVX-LABEL: test_reduce_v16i8: ; X64-AVX: ## %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX-NEXT: retq @@ -1004,16 +982,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE42-LABEL: test_reduce_v32i8: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pminub %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pminub %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pminub %xmm1, %xmm0 ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: psrlw $8, %xmm1 ; X86-SSE42-NEXT: pminub %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -1022,14 +994,9 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -1038,15 +1005,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-AVX2-LABEL: test_reduce_v32i8: ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: vzeroupper @@ -1072,16 +1034,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-SSE42-LABEL: test_reduce_v32i8: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pminub %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pminub %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pminub %xmm1, %xmm0 ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: psrlw $8, %xmm1 ; X64-SSE42-NEXT: pminub %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -1090,14 +1046,9 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-AVX1: ## %bb.0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -1106,15 +1057,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-AVX2-LABEL: test_reduce_v32i8: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -1123,15 +1069,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-AVX512-LABEL: test_reduce_v32i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX512-NEXT: vzeroupper @@ -1942,16 +1883,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE42-NEXT: pminub %xmm3, %xmm1 ; X86-SSE42-NEXT: pminub %xmm2, %xmm0 ; X86-SSE42-NEXT: pminub %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT: pminub %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT: pminub %xmm1, %xmm0 ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrld $16, %xmm1 +; X86-SSE42-NEXT: psrlw $8, %xmm1 ; X86-SSE42-NEXT: pminub %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X86-SSE42-NEXT: retl @@ -1963,14 +1898,9 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX1-NEXT: vzeroupper @@ -1980,15 +1910,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X86-AVX2-NEXT: vzeroupper @@ -2018,16 +1943,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-SSE42-NEXT: pminub %xmm3, %xmm1 ; X64-SSE42-NEXT: pminub %xmm2, %xmm0 ; X64-SSE42-NEXT: pminub %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT: pminub %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT: pminub %xmm1, %xmm0 ; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrld $16, %xmm1 +; X64-SSE42-NEXT: psrlw $8, %xmm1 ; X64-SSE42-NEXT: pminub %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax ; X64-SSE42-NEXT: retq @@ -2039,14 +1958,9 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2 ; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX1-NEXT: vzeroupper @@ -2056,15 +1970,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX2-NEXT: vzeroupper @@ -2073,17 +1982,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-AVX512-LABEL: test_reduce_v64i8: ; X64-AVX512: ## %bb.0: ; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax ; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax ; X64-AVX512-NEXT: vzeroupper diff --git a/test/CodeGen/X86/known-bits-vector.ll b/test/CodeGen/X86/known-bits-vector.ll index 283d1f93dfb6..46a888f3b9b6 100644 --- a/test/CodeGen/X86/known-bits-vector.ll +++ b/test/CodeGen/X86/known-bits-vector.ll @@ -160,17 +160,19 @@ define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind { define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind { ; X32-LABEL: knownbits_mask_or_shuffle_uitofp: ; X32: # %bb.0: -; X32-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-NEXT: vorps {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0 +; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X32-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6] +; X32-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X32-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: knownbits_mask_or_shuffle_uitofp: ; X64: # %bb.0: -; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6] +; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq %1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085> diff --git a/test/CodeGen/X86/machinesink-merge-debuginfo.ll b/test/CodeGen/X86/machinesink-merge-debuginfo.ll index d8fcea1872e8..f5023bbeb5f9 100644 --- a/test/CodeGen/X86/machinesink-merge-debuginfo.ll +++ b/test/CodeGen/X86/machinesink-merge-debuginfo.ll @@ -5,6 +5,21 @@ source_filename = "test-sink-debug.cpp" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" +; double foo(double x, double y, bool c) { +; double a = x / 3.0; +; double b = y / 5.0; +; double ret; +; +; if (c) +; ret = a + 1.0; +; else +; ret = b + 1.0; +; +; ret = ret + 1.0; +; +; return ret; +; } + ; Function Attrs: nounwind readnone uwtable define double @_Z3fooddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr !dbg !7 { tail call void @llvm.dbg.value(metadata double %x, metadata !13, metadata !DIExpression()), !dbg !16 @@ -17,9 +32,10 @@ first: %e = fadd double %a, 1.000000e+00 br label %final second: - %f = fadd double %b, 1.000000e+00, !dbg !17 -; CHECK: debug-location !17 -; CHECK: debug-location !17 +; CHECK-NOT: debug-location !17 +; CHECK: debug-location !18 +; CHECK-NOT: debug-location !17 + %f = fadd double %b, 1.000000e+00, !dbg !18 br label %final final: %cond = phi double [%e, %first], [%f, %second] @@ -27,15 +43,39 @@ final: ret double %d } -; Function Attrs: nounwind readnone speculatable -declare void @llvm.dbg.value(metadata, metadata, metadata) #1 -attributes #1 = { nounwind readnone speculatable } + +; Function Attrs: nounwind readnone uwtable +define double @_Z4foo1ddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr !dbg !19 { + tail call void @llvm.dbg.value(metadata double %x, metadata !21, metadata !DIExpression()), !dbg !24 + tail call void @llvm.dbg.value(metadata double %y, metadata !22, metadata !DIExpression()), !dbg !24 + tail call void @llvm.dbg.value(metadata i1 %c, metadata !23, metadata !DIExpression()), !dbg !24 + %a = fdiv double %x, 3.000000e+00 + %b = fdiv double %y, 5.000000e+00, !dbg !25 + br i1 %c, label %first, label %second +first: + %e = fadd double %a, 1.000000e+00 + br label %final +second: + %f = fadd double %b, 1.000000e+00, !dbg !25 +; CHECK: debug-location !25 +; CHECK-NEXT: debug-location !25 + br label %final +final: + %cond = phi double [%e, %first], [%f, %second] + %d = fadd double %cond, 1.000000e+00 + ret double %d +} !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5} !llvm.ident = !{!6} +attributes #1 = { nounwind readnone speculatable } + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 313291)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) !1 = !DIFile(filename: "test-sink-debug.cpp", directory: "/tmp") !2 = !{} @@ -54,3 +94,11 @@ attributes #1 = { nounwind readnone speculatable } !15 = !DILocalVariable(name: "c", arg: 3, scope: !7, file: !1, line: 1, type: !11) !16 = !DILocation(line: 1, column: 19, scope: !7) !17 = !DILocation(line: 2, column: 26, scope: !7) +!18 = !DILocation(line: 3, column: 20, scope: !7) +!19 = distinct !DISubprogram(name: "foo1", linkageName: "_Z4foo1ddb", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !20) +!20 = !{!21, !22, !23} +!21 = !DILocalVariable(name: "x", arg: 1, scope: !19, file: !1, line: 1, type: !10) +!22 = !DILocalVariable(name: "y", arg: 2, scope: !19, file: !1, line: 1, type: !10) +!23 = !DILocalVariable(name: "c", arg: 3, scope: !19, file: !1, line: 1, type: !11) +!24 = !DILocation(line: 1, column: 19, scope: !19) +!25 = !DILocation(line: 2, column: 26, scope: !19) diff --git a/test/CodeGen/X86/machinesink-null-debuginfo.ll b/test/CodeGen/X86/machinesink-null-debuginfo.ll index 454e0cd704ff..c0399b3cfa81 100644 --- a/test/CodeGen/X86/machinesink-null-debuginfo.ll +++ b/test/CodeGen/X86/machinesink-null-debuginfo.ll @@ -11,10 +11,10 @@ define double @_Z3fooddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr tail call void @llvm.dbg.value(metadata double %y, metadata !14, metadata !DIExpression()), !dbg !17 tail call void @llvm.dbg.value(metadata i1 %c, metadata !15, metadata !DIExpression()), !dbg !18 %a = fdiv double %x, 3.000000e+00 - %b = fdiv double %y, 5.000000e+00, !dbg !21 + %b = fdiv double %y, 5.000000e+00, !dbg !19 %cond = select i1 %c, double %a, double %b -; CHECK-NOT: debug-location !21 - ret double %cond, !dbg !22 +; CHECK-NOT: debug-location !19 + ret double %cond, !dbg !20 } ; Function Attrs: nounwind readnone speculatable @@ -45,5 +45,5 @@ attributes #1 = { nounwind readnone speculatable } !16 = !DILocation(line: 1, column: 19, scope: !7) !17 = !DILocation(line: 1, column: 29, scope: !7) !18 = !DILocation(line: 1, column: 37, scope: !7) -!21 = !DILocation(line: 2, column: 26, scope: !7) -!22 = !DILocation(line: 2, column: 3, scope: !7) +!19 = !DILocation(line: 2, column: 26, scope: !7) +!20 = !DILocation(line: 2, column: 3, scope: !7) diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll index 1eb2631e26ef..d318dde34434 100644 --- a/test/CodeGen/X86/masked_gather_scatter.ll +++ b/test/CodeGen/X86/masked_gather_scatter.ll @@ -1500,7 +1500,8 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { ; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0 ; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0 ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,1,0] +; KNL_32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; KNL_32-NEXT: vmovdqa %xmm2, %xmm2 ; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} @@ -1596,7 +1597,8 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) { ; KNL_32-NEXT: vpsllq $32, %xmm1, %xmm1 ; KNL_32-NEXT: vpsraq $32, %zmm1, %zmm1 ; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,1,0] +; KNL_32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; KNL_32-NEXT: vmovdqa %xmm2, %xmm2 ; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -2606,56 +2608,32 @@ define <2 x float> @large_index(float* %base, <2 x i128> %ind, <2 x i1> %mask, < define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) { ; KNL_64-LABEL: sext_i8_index: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpmovsxbw %xmm0, %ymm0 -; KNL_64-NEXT: vpmovsxwq %xmm0, %zmm1 -; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL_64-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm1 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1 -; KNL_64-NEXT: kxnorw %k0, %k0, %k2 -; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2} -; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} -; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: sext_i8_index: ; KNL_32: # %bb.0: ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpmovsxbw %xmm0, %ymm0 -; KNL_32-NEXT: vpmovsxwq %xmm0, %zmm1 -; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL_32-NEXT: vpmovsxwq %xmm0, %zmm0 +; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm1 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1 -; KNL_32-NEXT: kxnorw %k0, %k0, %k2 -; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2} -; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1} -; KNL_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} ; KNL_32-NEXT: retl ; ; SKX-LABEL: sext_i8_index: ; SKX: # %bb.0: -; SKX-NEXT: vpmovsxbw %xmm0, %ymm0 -; SKX-NEXT: vpmovsxwq %xmm0, %zmm1 -; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT: vpmovsxwq %xmm0, %zmm0 +; SKX-NEXT: vpmovsxbd %xmm0, %zmm1 ; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: kxnorw %k0, %k0, %k2 -; SKX-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2} -; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} -; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} ; SKX-NEXT: retq ; ; SKX_32-LABEL: sext_i8_index: ; SKX_32: # %bb.0: ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpmovsxbw %xmm0, %ymm0 -; SKX_32-NEXT: vpmovsxwq %xmm0, %zmm1 -; SKX_32-NEXT: vextracti128 $1, %ymm0, %xmm0 -; SKX_32-NEXT: vpmovsxwq %xmm0, %zmm0 +; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm1 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1 -; SKX_32-NEXT: kxnorw %k0, %k0, %k2 -; SKX_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2} -; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1} -; SKX_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} ; SKX_32-NEXT: retl %sext_ind = sext <16 x i8> %ind to <16 x i64> @@ -2669,40 +2647,42 @@ define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) { define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) { ; KNL_64-LABEL: sext_v8i8_index: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; KNL_64-NEXT: vpsllq $56, %zmm0, %zmm0 -; KNL_64-NEXT: vpsraq $56, %zmm0, %zmm1 +; KNL_64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; KNL_64-NEXT: kxnorw %k0, %k0, %k1 +; KNL_64-NEXT: vpslld $24, %ymm0, %ymm0 +; KNL_64-NEXT: vpsrad $24, %ymm0, %ymm0 +; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1 ; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: sext_v8i8_index: ; KNL_32: # %bb.0: -; KNL_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; KNL_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT: vpsllq $56, %zmm0, %zmm0 -; KNL_32-NEXT: vpsraq $56, %zmm0, %zmm1 +; KNL_32-NEXT: vpslld $24, %ymm0, %ymm0 +; KNL_32-NEXT: vpsrad $24, %ymm0, %ymm0 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1 +; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1 ; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1} ; KNL_32-NEXT: retl ; ; SKX-LABEL: sext_v8i8_index: ; SKX: # %bb.0: -; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; SKX-NEXT: vpsllq $56, %zmm0, %zmm0 -; SKX-NEXT: vpsraq $56, %zmm0, %zmm1 +; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} +; SKX-NEXT: vpslld $24, %ymm0, %ymm0 +; SKX-NEXT: vpsrad $24, %ymm0, %ymm1 +; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} ; SKX-NEXT: retq ; ; SKX_32-LABEL: sext_v8i8_index: ; SKX_32: # %bb.0: -; SKX_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; SKX_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT: vpsllq $56, %zmm0, %zmm0 -; SKX_32-NEXT: vpsraq $56, %zmm0, %zmm1 +; SKX_32-NEXT: vpslld $24, %ymm0, %ymm0 +; SKX_32-NEXT: vpsrad $24, %ymm0, %ymm1 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1 -; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1} +; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} ; SKX_32-NEXT: retl %sext_ind = sext <8 x i8> %ind to <8 x i64> @@ -2764,3 +2744,48 @@ define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> } declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>) +define <16 x float> @zext_index(float* %base, <16 x i32> %ind) { +; KNL_64-LABEL: zext_index: +; KNL_64: # %bb.0: +; KNL_64-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 +; KNL_64-NEXT: kxnorw %k0, %k0, %k1 +; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} +; KNL_64-NEXT: retq +; +; KNL_32-LABEL: zext_index: +; KNL_32: # %bb.0: +; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT: vpandd {{\.LCPI.*}}{1to16}, %zmm0, %zmm1 +; KNL_32-NEXT: kxnorw %k0, %k0, %k1 +; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} +; KNL_32-NEXT: retl +; +; SKX_SMALL-LABEL: zext_index: +; SKX_SMALL: # %bb.0: +; SKX_SMALL-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm1 +; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1 +; SKX_SMALL-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} +; SKX_SMALL-NEXT: retq +; +; SKX_LARGE-LABEL: zext_index: +; SKX_LARGE: # %bb.0: +; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax +; SKX_LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm1 +; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1 +; SKX_LARGE-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} +; SKX_LARGE-NEXT: retq +; +; SKX_32-LABEL: zext_index: +; SKX_32: # %bb.0: +; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT: vandps {{\.LCPI.*}}{1to16}, %zmm0, %zmm1 +; SKX_32-NEXT: kxnorw %k0, %k0, %k1 +; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} +; SKX_32-NEXT: retl + %ind_masked = and <16 x i32> %ind, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> + %sext_ind = zext <16 x i32> %ind_masked to <16 x i64> + %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind + + %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) + ret <16 x float>%res +} diff --git a/test/CodeGen/X86/popcnt.ll b/test/CodeGen/X86/popcnt.ll index d7622c8d0cab..f7b9ea9b8b2e 100644 --- a/test/CodeGen/X86/popcnt.ll +++ b/test/CodeGen/X86/popcnt.ll @@ -78,7 +78,7 @@ define i16 @cnt16(i16 %x) nounwind readnone { ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: shll $8, %eax ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %ah, %eax +; X32-NEXT: movzbl %ah, %eax # NOREX ; X32-NEXT: # kill: def %ax killed %ax killed %eax ; X32-NEXT: retl ; diff --git a/test/CodeGen/X86/prefetch.ll b/test/CodeGen/X86/prefetch.ll index 17a9ac994a79..839948174a43 100644 --- a/test/CodeGen/X86/prefetch.ll +++ b/test/CodeGen/X86/prefetch.ll @@ -1,27 +1,101 @@ -; RUN: llc < %s -mtriple=i686-- -mattr=+sse | FileCheck %s -; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s -; RUN: llc < %s -mtriple=i686-- -mattr=+sse -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHW -; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=SLM -; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHW -; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=NOPRFCHW +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-- -mattr=+sse | FileCheck %s --check-prefix=SSE +; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefix=SSE +; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prfchw | FileCheck %s -check-prefix=PRFCHWSSE +; RUN: llc < %s -mtriple=i686-- -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHWSSE +; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=PRFCHWSSE +; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHWSSE +; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=SSE +; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1 +; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1 +; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+3dnow,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1 +; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s -check-prefix=3DNOW +; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow,+prfchw | FileCheck %s -check-prefix=PRFCHW3DNOW + +; Rules: +; 3dnow by itself get you just the single prefetch instruction with no hints +; sse provides prefetch0/1/2/nta +; supporting prefetchw, but not 3dnow implicitly provides prefetcht0/1/2/nta regardless of sse setting as we need something to fall back to for the non-write hint. +; supporting prefetchwt1 implies prefetcht0/1/2/nta and prefetchw regardless of other settings. this allows levels for non-write and gives us an instruction for write+T0 +; 3dnow prefetch instruction will only get used if you have no other prefetch instructions enabled ; rdar://10538297 define void @t(i8* %ptr) nounwind { +; SSE-LABEL: t: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-NEXT: prefetcht2 (%eax) +; SSE-NEXT: prefetcht1 (%eax) +; SSE-NEXT: prefetcht0 (%eax) +; SSE-NEXT: prefetchnta (%eax) +; SSE-NEXT: prefetcht2 (%eax) +; SSE-NEXT: prefetcht1 (%eax) +; SSE-NEXT: prefetcht0 (%eax) +; SSE-NEXT: prefetchnta (%eax) +; SSE-NEXT: retl +; +; PRFCHWSSE-LABEL: t: +; PRFCHWSSE: # %bb.0: # %entry +; PRFCHWSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; PRFCHWSSE-NEXT: prefetcht2 (%eax) +; PRFCHWSSE-NEXT: prefetcht1 (%eax) +; PRFCHWSSE-NEXT: prefetcht0 (%eax) +; PRFCHWSSE-NEXT: prefetchnta (%eax) +; PRFCHWSSE-NEXT: prefetchw (%eax) +; PRFCHWSSE-NEXT: prefetchw (%eax) +; PRFCHWSSE-NEXT: prefetchw (%eax) +; PRFCHWSSE-NEXT: prefetchw (%eax) +; PRFCHWSSE-NEXT: retl +; +; PREFETCHWT1-LABEL: t: +; PREFETCHWT1: # %bb.0: # %entry +; PREFETCHWT1-NEXT: movl {{[0-9]+}}(%esp), %eax +; PREFETCHWT1-NEXT: prefetcht2 (%eax) +; PREFETCHWT1-NEXT: prefetcht1 (%eax) +; PREFETCHWT1-NEXT: prefetcht0 (%eax) +; PREFETCHWT1-NEXT: prefetchnta (%eax) +; PREFETCHWT1-NEXT: prefetchwt1 (%eax) +; PREFETCHWT1-NEXT: prefetchwt1 (%eax) +; PREFETCHWT1-NEXT: prefetchw (%eax) +; PREFETCHWT1-NEXT: prefetchwt1 (%eax) +; PREFETCHWT1-NEXT: retl +; +; 3DNOW-LABEL: t: +; 3DNOW: # %bb.0: # %entry +; 3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; 3DNOW-NEXT: prefetch (%eax) +; 3DNOW-NEXT: prefetch (%eax) +; 3DNOW-NEXT: prefetch (%eax) +; 3DNOW-NEXT: prefetch (%eax) +; 3DNOW-NEXT: prefetch (%eax) +; 3DNOW-NEXT: prefetch (%eax) +; 3DNOW-NEXT: prefetch (%eax) +; 3DNOW-NEXT: prefetch (%eax) +; 3DNOW-NEXT: retl +; +; PRFCHW3DNOW-LABEL: t: +; PRFCHW3DNOW: # %bb.0: # %entry +; PRFCHW3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; PRFCHW3DNOW-NEXT: prefetch (%eax) +; PRFCHW3DNOW-NEXT: prefetch (%eax) +; PRFCHW3DNOW-NEXT: prefetch (%eax) +; PRFCHW3DNOW-NEXT: prefetch (%eax) +; PRFCHW3DNOW-NEXT: prefetchw (%eax) +; PRFCHW3DNOW-NEXT: prefetchw (%eax) +; PRFCHW3DNOW-NEXT: prefetchw (%eax) +; PRFCHW3DNOW-NEXT: prefetchw (%eax) +; PRFCHW3DNOW-NEXT: retl entry: -; CHECK: prefetcht2 -; CHECK: prefetcht1 -; CHECK: prefetcht0 -; CHECK: prefetchnta -; PRFCHW: prefetchw -; NOPRFCHW-NOT: prefetchw -; SLM: prefetchw tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 1, i32 1 ) tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 2, i32 1 ) tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 ) tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 0, i32 1 ) + tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 1, i32 1 ) + tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 2, i32 1 ) tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 3, i32 1 ) + tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 0, i32 1 ) ret void } -declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind +declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/test/CodeGen/X86/shuffle-strided-with-offset-128.ll index 0f1f818e250d..2df115e2f9e4 100644 --- a/test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ b/test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -2,7 +2,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW @@ -363,12 +364,26 @@ define void @shuffle_v8i16_to_v2i16_1(<8 x i16>* %L, <2 x i16>* %S) nounwind { ; SSE-NEXT: movd %xmm0, (%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_to_v2i16_1: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_to_v2i16_1: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_1: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_1: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1: ; AVX512F: # %bb.0: @@ -409,12 +424,26 @@ define void @shuffle_v8i16_to_v2i16_2(<8 x i16>* %L, <2 x i16>* %S) nounwind { ; SSE-NEXT: movd %xmm0, (%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_to_v2i16_2: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_to_v2i16_2: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2: ; AVX512F: # %bb.0: @@ -455,12 +484,26 @@ define void @shuffle_v8i16_to_v2i16_3(<8 x i16>* %L, <2 x i16>* %S) nounwind { ; SSE-NEXT: movd %xmm0, (%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_to_v2i16_3: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_to_v2i16_3: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_3: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_3: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3: ; AVX512F: # %bb.0: diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/test/CodeGen/X86/shuffle-strided-with-offset-256.ll index 7cef269ebc2b..081c962ab94a 100644 --- a/test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ b/test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW @@ -362,18 +363,30 @@ define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_to_v4i16_1: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1: ; AVX512F: # %bb.0: @@ -446,18 +459,30 @@ define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_to_v4i16_2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2: ; AVX512F: # %bb.0: @@ -522,18 +547,30 @@ define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_to_v4i16_3: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3: ; AVX512F: # %bb.0: @@ -633,11 +670,9 @@ define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) ; AVX512VL-NEXT: vzeroupper @@ -659,11 +694,9 @@ define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper @@ -795,11 +828,9 @@ define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7] ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) ; AVX512VL-NEXT: vzeroupper @@ -821,11 +852,9 @@ define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7] ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7] ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper @@ -949,14 +978,9 @@ define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) ; AVX512VL-NEXT: vzeroupper @@ -978,14 +1002,9 @@ define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/test/CodeGen/X86/shuffle-strided-with-offset-512.ll index 7f3431fabedc..a5d0a7fa401b 100644 --- a/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -99,10 +99,9 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,17,19,21,23,9,11,13,15,25,27,29,31] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31] ; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] -; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L @@ -673,17 +672,14 @@ define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11] ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,9,9,8,8,9,9,8,8,9,9,10,10,11,11] ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) @@ -831,17 +827,14 @@ define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11] ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [3,3,11,11,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) @@ -989,24 +982,14 @@ define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [5,5,13,13,4,4,5,5,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) @@ -1154,17 +1137,14 @@ define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [14,14,15,15,6,6,7,7,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7] ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,15,15,6,6,7,7,4,4,5,5,6,6,7,7] ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) diff --git a/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/test/CodeGen/X86/shuffle-vs-trunc-128.ll index 1bfe37b1497e..f2cd81d3d937 100644 --- a/test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ b/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -2,7 +2,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW @@ -436,12 +437,26 @@ define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind { ; SSE-NEXT: movd %xmm0, (%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_to_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_to_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: shuffle_v8i16_to_v2i16: ; AVX512F: # %bb.0: @@ -482,12 +497,26 @@ define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind { ; SSE-NEXT: movd %xmm0, (%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: trunc_v2i64_to_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_v2i64_to_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vmovd %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v2i64_to_v2i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_v2i64_to_v2i16: ; AVX512F: # %bb.0: diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 59a8aa47246c..5b46c73a3f65 100644 --- a/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW @@ -139,59 +140,17 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: shuffle_v16i16_to_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_to_v8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i16_to_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v16i16_to_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> store <8 x i16> %strided.vec, <8 x i16>* %S @@ -290,13 +249,21 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_v4i64_to_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vmovaps %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps (%rdi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsi) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i32: ; AVX512F: # %bb.0: @@ -399,12 +366,9 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper @@ -490,18 +454,30 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_to_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: shuffle_v16i16_to_v4i16: ; AVX512F: # %bb.0: @@ -563,14 +539,23 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_v4i64_to_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i16: ; AVX512F: # %bb.0: @@ -693,14 +678,23 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_v4i64_to_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vmovd %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd (%rdi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_v4i64_to_v4i8: ; AVX512F: # %bb.0: diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll index 3fa148405f6b..4577cd8f8b44 100644 --- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -148,10 +148,9 @@ define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind { ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] ; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] -; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L diff --git a/test/CodeGen/X86/var-permute-256.ll b/test/CodeGen/X86/var-permute-256.ll index 82a790298f23..3baab2476d40 100644 --- a/test/CodeGen/X86/var-permute-256.ll +++ b/test/CodeGen/X86/var-permute-256.ll @@ -142,14 +142,13 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind { ; AVX1-NEXT: andl $7, %r10d ; AVX1-NEXT: andl $28, %edi ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: movq %rsp, %rax -; AVX1-NEXT: vpinsrd $1, (%rdx,%rax), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $1, (%rsp,%rdx), %xmm0, %xmm0 ; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $3, (%rdi,%rax), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $3, (%rsp,%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vpinsrd $1, (%rsi,%rax), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrd $1, (%rsp,%rsi), %xmm1, %xmm1 ; AVX1-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1 -; AVX1-NEXT: vpinsrd $3, (%rcx,%rax), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrd $3, (%rsp,%rcx), %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: movq %rbp, %rsp ; AVX1-NEXT: popq %rbp @@ -505,118 +504,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX1-NEXT: vpextrb $0, %xmm2, %eax ; AVX1-NEXT: vmovaps %ymm0, (%rsp) ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movq %rsp, %rcx -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpextrb $1, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $2, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $3, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $4, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $5, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $6, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $7, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $8, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $9, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $10, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $11, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $12, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $13, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $14, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $15, %xmm2, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX1-NEXT: vpextrb $0, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vmovd %eax, %xmm2 ; AVX1-NEXT: vpextrb $1, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $2, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $3, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $4, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $5, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $6, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $7, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $8, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $9, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $10, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $11, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $13, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $14, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2 ; AVX1-NEXT: vpextrb $15, %xmm1, %eax ; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: movzbl (%rax,%rcx), %eax +; AVX1-NEXT: movzbl (%rsp,%rax), %eax ; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: movq %rbp, %rsp @@ -633,118 +631,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX2-NEXT: vpextrb $0, %xmm2, %eax ; AVX2-NEXT: vmovaps %ymm0, (%rsp) ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movq %rsp, %rcx -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vpextrb $1, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $2, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $3, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $4, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $5, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $6, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $7, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $8, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $9, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $10, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $11, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $12, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $13, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $14, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $15, %xmm2, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpextrb $0, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vmovd %eax, %xmm2 ; AVX2-NEXT: vpextrb $1, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $2, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $3, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $4, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $5, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $6, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $7, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $8, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $9, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $10, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $11, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $13, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $14, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2 ; AVX2-NEXT: vpextrb $15, %xmm1, %eax ; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: movzbl (%rax,%rcx), %eax +; AVX2-NEXT: movzbl (%rsp,%rax), %eax ; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: movq %rbp, %rsp @@ -761,118 +758,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX512F-NEXT: vpextrb $0, %xmm2, %eax ; AVX512F-NEXT: vmovaps %ymm0, (%rsp) ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movq %rsp, %rcx -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vpextrb $1, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $2, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $3, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $4, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $5, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $6, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $7, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $8, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $9, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $10, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $11, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $12, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $13, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $14, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $15, %xmm2, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512F-NEXT: vpextrb $0, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vmovd %eax, %xmm2 ; AVX512F-NEXT: vpextrb $1, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $2, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $3, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $4, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $5, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $6, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $7, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $8, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $9, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $10, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $11, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $12, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $13, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $14, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2 ; AVX512F-NEXT: vpextrb $15, %xmm1, %eax ; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: movzbl (%rax,%rcx), %eax +; AVX512F-NEXT: movzbl (%rsp,%rax), %eax ; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-NEXT: movq %rbp, %rsp @@ -889,118 +885,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind { ; AVX512VL-NEXT: vpextrb $0, %xmm2, %eax ; AVX512VL-NEXT: vmovaps %ymm0, (%rsp) ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movq %rsp, %rcx -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vmovd %eax, %xmm0 ; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vmovd %eax, %xmm2 ; AVX512VL-NEXT: vpextrb $1, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $2, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $3, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $4, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $5, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $6, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $7, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $8, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $9, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $10, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $11, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $12, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $13, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $14, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT: vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2 ; AVX512VL-NEXT: vpextrb $15, %xmm1, %eax ; AVX512VL-NEXT: andl $31, %eax -; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT: movzbl (%rsp,%rax), %eax ; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-NEXT: movq %rbp, %rsp @@ -1240,7 +1235,6 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi ; AVX1-NEXT: andl $7, %r10d ; AVX1-NEXT: andl $28, %edi ; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: movq %rsp, %rax ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] diff --git a/test/CodeGen/X86/var-permute-512.ll b/test/CodeGen/X86/var-permute-512.ll index a5aa73cdf1a2..3f9f96b008c5 100644 --- a/test/CodeGen/X86/var-permute-512.ll +++ b/test/CodeGen/X86/var-permute-512.ll @@ -511,265 +511,201 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind { ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, (%rsp) -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: movzbl (%rax,%rcx), %eax -; NOBW-NEXT: vpextrb $1, %xmm4, %ecx -; NOBW-NEXT: andl $63, %ecx +; NOBW-NEXT: movzbl 3008(%rsp,%rax), %eax ; NOBW-NEXT: vmovd %eax, %xmm0 -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpextrb $1, %xmm4, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $1, 2944(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $2, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $2, 2880(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $3, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $3, 2816(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $4, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $4, 2752(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $5, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $5, 2688(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $6, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $6, 2624(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $7, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $7, 2560(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $8, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $8, 2496(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $9, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $9, 2432(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $10, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $10, 2368(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $11, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $11, 2304(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $12, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $12, 2240(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $13, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $13, 2176(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $14, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $14, 2112(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $15, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $15, 2048(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $0, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: movzbl (%rax,%rcx), %eax -; NOBW-NEXT: vpextrb $1, %xmm2, %ecx -; NOBW-NEXT: andl $63, %ecx +; NOBW-NEXT: movzbl 4032(%rsp,%rax), %eax ; NOBW-NEXT: vmovd %eax, %xmm1 -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm1, %xmm1 +; NOBW-NEXT: vpextrb $1, %xmm2, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $1, 3968(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $2, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $2, 3904(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $3, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $3, 3840(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $4, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $4, 3776(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $5, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $5, 3712(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $6, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $6, 3648(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $7, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $7, 3584(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $8, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $8, 3520(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $9, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $9, 3456(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $10, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $10, 3392(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $11, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $11, 3328(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $12, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $12, 3264(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $13, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $13, 3200(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $14, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $14, 3136(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $15, %xmm2, %eax ; NOBW-NEXT: vextracti128 $1, %ymm3, %xmm2 ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $15, 3072(%rsp,%rax), %xmm1, %xmm1 ; NOBW-NEXT: vpextrb $0, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: movzbl (%rax,%rcx), %eax -; NOBW-NEXT: vpextrb $1, %xmm2, %ecx -; NOBW-NEXT: andl $63, %ecx +; NOBW-NEXT: movzbl 960(%rsp,%rax), %eax ; NOBW-NEXT: vmovd %eax, %xmm4 -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $1, %xmm2, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $1, 896(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $2, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $2, 832(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $3, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $3, 768(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $4, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $4, 704(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $5, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $5, 640(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $6, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $6, 576(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $7, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $7, 512(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $8, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $8, 448(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $9, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $9, 384(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $10, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $10, 320(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $11, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $11, 256(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $12, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $12, 192(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $13, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $13, 128(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $14, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $14, 64(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $15, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: movq %rsp, %rcx -; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm4, %xmm2 +; NOBW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm4, %xmm2 ; NOBW-NEXT: vpextrb $0, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: movzbl (%rax,%rcx), %eax -; NOBW-NEXT: vpextrb $1, %xmm3, %ecx -; NOBW-NEXT: andl $63, %ecx +; NOBW-NEXT: movzbl 1984(%rsp,%rax), %eax ; NOBW-NEXT: vmovd %eax, %xmm4 -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax -; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $1, %xmm3, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $1, 1920(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $2, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $2, 1856(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $3, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $3, 1792(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $4, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $4, 1728(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $5, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $5, 1664(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $6, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $6, 1600(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $7, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $7, 1536(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $8, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $8, 1472(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $9, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $9, 1408(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $10, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $10, 1344(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $11, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $11, 1280(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $12, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $12, 1216(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $13, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $13, 1152(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $14, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $14, 1088(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $15, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm4, %xmm3 +; NOBW-NEXT: vpinsrb $15, 1024(%rsp,%rax), %xmm4, %xmm3 ; NOBW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; NOBW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 ; NOBW-NEXT: movq %rbp, %rsp diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll index 12530adf15cb..2178eb70cdec 100644 --- a/test/CodeGen/X86/vector-compare-results.ll +++ b/test/CodeGen/X86/vector-compare-results.ll @@ -2384,11 +2384,10 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind { ; AVX512F-LABEL: test_cmp_v32f32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcmpltps %zmm0, %zmm2, %k1 -; AVX512F-NEXT: movl {{.*}}(%rip), %eax -; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vcmpltps %zmm1, %zmm3, %k1 -; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 @@ -2399,12 +2398,11 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind { ; ; AVX512DQ-LABEL: test_cmp_v32f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k1 -; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax -; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k1 -; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0 @@ -2893,11 +2891,10 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind { ; AVX512F-LABEL: test_cmp_v32i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 -; AVX512F-NEXT: movl {{.*}}(%rip), %eax -; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtd %zmm3, %zmm1, %k1 -; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 @@ -2908,12 +2905,11 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind { ; ; AVX512DQ-LABEL: test_cmp_v32i32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 -; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax -; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512DQ-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpcmpgtd %zmm3, %zmm1, %k1 -; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vpcmpgtd %zmm3, %zmm1, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0 @@ -5965,13 +5961,12 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind ; AVX512F-NEXT: vcmpltpd %zmm0, %zmm4, %k0 ; AVX512F-NEXT: vcmpltpd %zmm1, %zmm5, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT: movl {{.*}}(%rip), %eax -; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vcmpltpd %zmm2, %zmm6, %k0 ; AVX512F-NEXT: vcmpltpd %zmm3, %zmm7, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 @@ -5984,14 +5979,13 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm4, %k0 ; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm5, %k1 -; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax -; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vcmpltpd %zmm2, %zmm6, %k0 ; AVX512DQ-NEXT: vcmpltpd %zmm3, %zmm7, %k1 -; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0 @@ -6588,13 +6582,12 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; AVX512F-NEXT: vpcmpgtq %zmm4, %zmm0, %k0 ; AVX512F-NEXT: vpcmpgtq %zmm5, %zmm1, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT: movl {{.*}}(%rip), %eax -; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpcmpgtq %zmm6, %zmm2, %k0 ; AVX512F-NEXT: vpcmpgtq %zmm7, %zmm3, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 @@ -6607,14 +6600,13 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind { ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpcmpgtq %zmm4, %zmm0, %k0 ; AVX512DQ-NEXT: vpcmpgtq %zmm5, %zmm1, %k1 -; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax -; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpcmpgtq %zmm6, %zmm2, %k0 ; AVX512DQ-NEXT: vpcmpgtq %zmm7, %zmm3, %k1 -; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll index 44fe38fa86b9..4da23e539f6a 100644 --- a/test/CodeGen/X86/vector-half-conversions.ll +++ b/test/CodeGen/X86/vector-half-conversions.ll @@ -2195,8 +2195,7 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind { ; AVX512VL-NEXT: shlq $32, %rdx ; AVX512VL-NEXT: orq %rcx, %rdx ; AVX512VL-NEXT: vmovq %rdx, %xmm0 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VL-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -2205,108 +2204,30 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind { } define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind { -; AVX1-LABEL: cvt_4f32_to_8i16_zero: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: shll $16, %eax -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: movzwl %cx, %ecx -; AVX1-NEXT: orl %eax, %ecx -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: shll $16, %eax -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %edx -; AVX1-NEXT: movzwl %dx, %edx -; AVX1-NEXT: orl %eax, %edx -; AVX1-NEXT: shlq $32, %rdx -; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_4f32_to_8i16_zero: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: shll $16, %eax -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: movzwl %cx, %ecx -; AVX2-NEXT: orl %eax, %ecx -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: shll $16, %eax -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %edx -; AVX2-NEXT: movzwl %dx, %edx -; AVX2-NEXT: orl %eax, %edx -; AVX2-NEXT: shlq $32, %rdx -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512F-LABEL: cvt_4f32_to_8i16_zero: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: shll $16, %eax -; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX512F-NEXT: vmovd %xmm1, %ecx -; AVX512F-NEXT: movzwl %cx, %ecx -; AVX512F-NEXT: orl %eax, %ecx -; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: shll $16, %eax -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %edx -; AVX512F-NEXT: movzwl %dx, %edx -; AVX512F-NEXT: orl %eax, %edx -; AVX512F-NEXT: shlq $32, %rdx -; AVX512F-NEXT: orq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: cvt_4f32_to_8i16_zero: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: shll $16, %eax -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %ecx -; AVX512VL-NEXT: movzwl %cx, %ecx -; AVX512VL-NEXT: orl %eax, %ecx -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: shll $16, %eax -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %edx -; AVX512VL-NEXT: movzwl %dx, %edx -; AVX512VL-NEXT: orl %eax, %edx -; AVX512VL-NEXT: shlq $32, %rdx -; AVX512VL-NEXT: orq %rcx, %rdx -; AVX512VL-NEXT: vmovq %rdx, %xmm0 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512VL-NEXT: retq +; ALL-LABEL: cvt_4f32_to_8i16_zero: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: shll $16, %eax +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 +; ALL-NEXT: vmovd %xmm1, %ecx +; ALL-NEXT: movzwl %cx, %ecx +; ALL-NEXT: orl %eax, %ecx +; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: shll $16, %eax +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %edx +; ALL-NEXT: movzwl %dx, %edx +; ALL-NEXT: orl %eax, %edx +; ALL-NEXT: shlq $32, %rdx +; ALL-NEXT: orq %rcx, %rdx +; ALL-NEXT: vmovq %rdx, %xmm0 +; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; ALL-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> @@ -2715,8 +2636,7 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw ; AVX512VL-NEXT: shlq $32, %rdx ; AVX512VL-NEXT: orq %rcx, %rdx ; AVX512VL-NEXT: vmovq %rdx, %xmm0 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi) ; AVX512VL-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> @@ -2727,112 +2647,31 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw } define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind { -; AVX1-LABEL: store_cvt_4f32_to_8i16_zero: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: shll $16, %eax -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %ecx -; AVX1-NEXT: movzwl %cx, %ecx -; AVX1-NEXT: orl %eax, %ecx -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: shll $16, %eax -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %edx -; AVX1-NEXT: movzwl %dx, %edx -; AVX1-NEXT: orl %eax, %edx -; AVX1-NEXT: shlq $32, %rdx -; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vmovdqa %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_cvt_4f32_to_8i16_zero: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: shll $16, %eax -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %ecx -; AVX2-NEXT: movzwl %cx, %ecx -; AVX2-NEXT: orl %eax, %ecx -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm1, %eax -; AVX2-NEXT: shll $16, %eax -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %edx -; AVX2-NEXT: movzwl %dx, %edx -; AVX2-NEXT: orl %eax, %edx -; AVX2-NEXT: shlq $32, %rdx -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vmovdqa %xmm0, (%rdi) -; AVX2-NEXT: retq -; -; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: shll $16, %eax -; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX512F-NEXT: vmovd %xmm1, %ecx -; AVX512F-NEXT: movzwl %cx, %ecx -; AVX512F-NEXT: orl %eax, %ecx -; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: shll $16, %eax -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, %edx -; AVX512F-NEXT: movzwl %dx, %edx -; AVX512F-NEXT: orl %eax, %edx -; AVX512F-NEXT: shlq $32, %rdx -; AVX512F-NEXT: orq %rcx, %rdx -; AVX512F-NEXT: vmovq %rdx, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vmovdqa %xmm0, (%rdi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: shll $16, %eax -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %ecx -; AVX512VL-NEXT: movzwl %cx, %ecx -; AVX512VL-NEXT: orl %eax, %ecx -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: shll $16, %eax -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %edx -; AVX512VL-NEXT: movzwl %dx, %edx -; AVX512VL-NEXT: orl %eax, %edx -; AVX512VL-NEXT: shlq $32, %rdx -; AVX512VL-NEXT: orq %rcx, %rdx -; AVX512VL-NEXT: vmovq %rdx, %xmm0 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi) -; AVX512VL-NEXT: retq +; ALL-LABEL: store_cvt_4f32_to_8i16_zero: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: shll $16, %eax +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1 +; ALL-NEXT: vmovd %xmm1, %ecx +; ALL-NEXT: movzwl %cx, %ecx +; ALL-NEXT: orl %eax, %ecx +; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT: vmovd %xmm1, %eax +; ALL-NEXT: shll $16, %eax +; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %edx +; ALL-NEXT: movzwl %dx, %edx +; ALL-NEXT: orl %eax, %edx +; ALL-NEXT: shlq $32, %rdx +; ALL-NEXT: orq %rcx, %rdx +; ALL-NEXT: vmovq %rdx, %xmm0 +; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; ALL-NEXT: vmovdqa %xmm0, (%rdi) +; ALL-NEXT: retq %1 = fptrunc <4 x float> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> @@ -3389,8 +3228,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { ; AVX512VL-NEXT: shlq $32, %rax ; AVX512VL-NEXT: orq %r14, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VL-NEXT: addq $40, %rsp ; AVX512VL-NEXT: popq %rbx ; AVX512VL-NEXT: popq %r14 @@ -3478,84 +3316,43 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { ; AVX2-NEXT: popq %r14 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: cvt_4f64_to_8i16_zero: -; AVX512F: # %bb.0: -; AVX512F-NEXT: pushq %r14 -; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: subq $40, %rsp -; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: callq __truncdfhf2 -; AVX512F-NEXT: movl %eax, %ebx -; AVX512F-NEXT: shll $16, %ebx -; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: callq __truncdfhf2 -; AVX512F-NEXT: movzwl %ax, %r14d -; AVX512F-NEXT: orl %ebx, %r14d -; AVX512F-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: callq __truncdfhf2 -; AVX512F-NEXT: movl %eax, %ebx -; AVX512F-NEXT: shll $16, %ebx -; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512F-NEXT: callq __truncdfhf2 -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: orl %ebx, %eax -; AVX512F-NEXT: shlq $32, %rax -; AVX512F-NEXT: orq %r14, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: addq $40, %rsp -; AVX512F-NEXT: popq %rbx -; AVX512F-NEXT: popq %r14 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: cvt_4f64_to_8i16_zero: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: pushq %r14 -; AVX512VL-NEXT: pushq %rbx -; AVX512VL-NEXT: subq $40, %rsp -; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq __truncdfhf2 -; AVX512VL-NEXT: movl %eax, %ebx -; AVX512VL-NEXT: shll $16, %ebx -; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq __truncdfhf2 -; AVX512VL-NEXT: movzwl %ax, %r14d -; AVX512VL-NEXT: orl %ebx, %r14d -; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq __truncdfhf2 -; AVX512VL-NEXT: movl %eax, %ebx -; AVX512VL-NEXT: shll $16, %ebx -; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512VL-NEXT: callq __truncdfhf2 -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: orl %ebx, %eax -; AVX512VL-NEXT: shlq $32, %rax -; AVX512VL-NEXT: orq %r14, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512VL-NEXT: addq $40, %rsp -; AVX512VL-NEXT: popq %rbx -; AVX512VL-NEXT: popq %r14 -; AVX512VL-NEXT: retq +; AVX512-LABEL: cvt_4f64_to_8i16_zero: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: shll $16, %ebx +; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movzwl %ax, %r14d +; AVX512-NEXT: orl %ebx, %r14d +; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: shll $16, %ebx +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movzwl %ax, %eax +; AVX512-NEXT: orl %ebx, %eax +; AVX512-NEXT: shlq $32, %rax +; AVX512-NEXT: orq %r14, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> @@ -4095,8 +3892,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun ; AVX512VL-NEXT: shlq $32, %rax ; AVX512VL-NEXT: orq %rbx, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VL-NEXT: vmovdqa %xmm0, (%r14) ; AVX512VL-NEXT: addq $32, %rsp ; AVX512VL-NEXT: popq %rbx @@ -4195,92 +3991,47 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; -; AVX512F-LABEL: store_cvt_4f64_to_8i16_zero: -; AVX512F: # %bb.0: -; AVX512F-NEXT: pushq %rbp -; AVX512F-NEXT: pushq %r14 -; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: subq $32, %rsp -; AVX512F-NEXT: movq %rdi, %r14 -; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: callq __truncdfhf2 -; AVX512F-NEXT: movl %eax, %ebp -; AVX512F-NEXT: shll $16, %ebp -; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: callq __truncdfhf2 -; AVX512F-NEXT: movzwl %ax, %ebx -; AVX512F-NEXT: orl %ebp, %ebx -; AVX512F-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: callq __truncdfhf2 -; AVX512F-NEXT: movl %eax, %ebp -; AVX512F-NEXT: shll $16, %ebp -; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512F-NEXT: callq __truncdfhf2 -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: orl %ebp, %eax -; AVX512F-NEXT: shlq $32, %rax -; AVX512F-NEXT: orq %rbx, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vmovdqa %xmm0, (%r14) -; AVX512F-NEXT: addq $32, %rsp -; AVX512F-NEXT: popq %rbx -; AVX512F-NEXT: popq %r14 -; AVX512F-NEXT: popq %rbp -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: store_cvt_4f64_to_8i16_zero: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: pushq %rbp -; AVX512VL-NEXT: pushq %r14 -; AVX512VL-NEXT: pushq %rbx -; AVX512VL-NEXT: subq $32, %rsp -; AVX512VL-NEXT: movq %rdi, %r14 -; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq __truncdfhf2 -; AVX512VL-NEXT: movl %eax, %ebp -; AVX512VL-NEXT: shll $16, %ebp -; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq __truncdfhf2 -; AVX512VL-NEXT: movzwl %ax, %ebx -; AVX512VL-NEXT: orl %ebp, %ebx -; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: callq __truncdfhf2 -; AVX512VL-NEXT: movl %eax, %ebp -; AVX512VL-NEXT: shll $16, %ebp -; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512VL-NEXT: callq __truncdfhf2 -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: orl %ebp, %eax -; AVX512VL-NEXT: shlq $32, %rax -; AVX512VL-NEXT: orq %rbx, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovdqa %xmm0, (%r14) -; AVX512VL-NEXT: addq $32, %rsp -; AVX512VL-NEXT: popq %rbx -; AVX512VL-NEXT: popq %r14 -; AVX512VL-NEXT: popq %rbp -; AVX512VL-NEXT: retq +; AVX512-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $32, %rsp +; AVX512-NEXT: movq %rdi, %r14 +; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: shll $16, %ebp +; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movzwl %ax, %ebx +; AVX512-NEXT: orl %ebp, %ebx +; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movl %eax, %ebp +; AVX512-NEXT: shll $16, %ebp +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __truncdfhf2 +; AVX512-NEXT: movzwl %ax, %eax +; AVX512-NEXT: orl %ebp, %eax +; AVX512-NEXT: shlq $32, %rax +; AVX512-NEXT: orq %rbx, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vmovdqa %xmm0, (%r14) +; AVX512-NEXT: addq $32, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll index b40c9eddd46b..8af96c168be6 100644 --- a/test/CodeGen/X86/vector-rotate-128.ll +++ b/test/CodeGen/X86/vector-rotate-128.ll @@ -699,20 +699,35 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: var_rotate_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512-NEXT: vpsubb %xmm1, %xmm2, %xmm2 -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm1 -; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero -; AVX512-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: var_rotate_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: var_rotate_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VL-NEXT: vpsllvw %ymm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vpmovwb %ymm1, %xmm1 +; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VL-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512VL-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq ; ; XOP-LABEL: var_rotate_v16i8: ; XOP: # %bb.0: @@ -1249,16 +1264,29 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: constant_rotate_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm1 -; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512BW-LABEL: constant_rotate_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512VL-LABEL: constant_rotate_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm1 +; AVX512VL-NEXT: vpmovwb %ymm1, %xmm1 +; AVX512VL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512VL-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq ; ; XOP-LABEL: constant_rotate_v16i8: ; XOP: # %bb.0: diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll index ea33f22cc07a..ca670f40ab3f 100644 --- a/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -531,23 +531,42 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: var_shift_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: var_shift_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: var_shift_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v16i8: ; X32-SSE: # %bb.0: @@ -948,25 +967,46 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512DQ-LABEL: splatvar_shift_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: splatvar_shift_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: splatvar_shift_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v16i8: ; X32-SSE: # %bb.0: @@ -1441,21 +1481,39 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: constant_shift_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: constant_shift_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v16i8: ; X32-SSE: # %bb.0: diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll index 307cf287219d..890cedf97c9d 100644 --- a/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -451,23 +451,42 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: var_shift_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: var_shift_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: var_shift_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v16i8: ; X32-SSE: # %bb.0: @@ -753,25 +772,46 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512DQ-LABEL: splatvar_shift_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: splatvar_shift_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: splatvar_shift_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v16i8: ; X32-SSE: # %bb.0: @@ -1148,21 +1188,39 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: constant_shift_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: constant_shift_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v16i8: ; X32-SSE: # %bb.0: diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll index b518ad5fcffd..9481e46c0c52 100644 --- a/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/test/CodeGen/X86/vector-shift-shl-128.ll @@ -401,23 +401,42 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: var_shift_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512DQ-LABEL: var_shift_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: var_shift_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: var_shift_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: var_shift_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: var_shift_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: var_shift_v16i8: ; X32-SSE: # %bb.0: @@ -695,25 +714,46 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX2-NEXT: retq ; -; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512DQ-LABEL: splatvar_shift_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: splatvar_shift_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: splatvar_shift_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: splatvar_shift_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: splatvar_shift_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: splatvar_shift_v16i8: ; X32-SSE: # %bb.0: @@ -998,21 +1038,39 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 ; XOP-NEXT: retq ; -; AVX512-LABEL: constant_shift_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512DQ-LABEL: constant_shift_v16i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq ; -; AVX512VL-LABEL: constant_shift_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq +; AVX512BW-LABEL: constant_shift_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v16i8: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v16i8: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq ; ; X32-SSE-LABEL: constant_shift_v16i8: ; X32-SSE: # %bb.0: diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index 2fcbd89b857e..2f5a2b116115 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -3,8 +3,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLVBMI define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: @@ -58,17 +60,10 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01( ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ret <16 x i8> %shuffle } @@ -93,17 +88,10 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08( ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> ret <16 x i8> %shuffle } @@ -115,11 +103,27 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03( ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: -; AVX: # %bb.0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; AVX1: # %bb.0: +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3] +; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3> ret <16 x i8> %shuffle } @@ -131,11 +135,27 @@ define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07( ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: -; AVX: # %bb.0: -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; AVX1: # %bb.0: +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] +; AVX512VL-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7> ret <16 x i8> %shuffle } @@ -1203,12 +1223,25 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00( ; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: +; AVX1OR2: # %bb.0: # %entry +; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero +; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] +; AVX1OR2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: +; AVX512VLBW: # %bb.0: # %entry +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero +; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0] +; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: +; AVX512VLVBMI: # %bb.0: # %entry +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = <u,10,2,7,22,14,7,2,18,3,1,14,18,9,11,0> +; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 +; AVX512VLVBMI-NEXT: retq entry: %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0> diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 60bc36948d23..072d71fae570 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -3,8 +3,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST define <8 x i16> @shuffle_v8i16_01012323(<8 x i16> %a, <8 x i16> %b) { ; SSE-LABEL: shuffle_v8i16_01012323: @@ -85,11 +87,33 @@ define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) { ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_00004444: -; AVX: # %bb.0: -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_00004444: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_00004444: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_00004444: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_00004444: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_00004444: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> ret <8 x i16> %shuffle } @@ -126,11 +150,33 @@ define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) { ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_31206745: -; AVX: # %bb.0: -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_31206745: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_31206745: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_31206745: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_31206745: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_31206745: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 5> ret <8 x i16> %shuffle } @@ -179,11 +225,33 @@ define <8 x i16> @shuffle_v8i16_23026745(<8 x i16> %a, <8 x i16> %b) { ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_23026745: -; AVX: # %bb.0: -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_23026745: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_23026745: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_23026745: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_23026745: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_23026745: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 5> ret <8 x i16> %shuffle } @@ -194,11 +262,33 @@ define <8 x i16> @shuffle_v8i16_23016747(<8 x i16> %a, <8 x i16> %b) { ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_23016747: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_23016747: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_23016747: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_23016747: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_23016747: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_23016747: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 7> ret <8 x i16> %shuffle } @@ -597,11 +687,33 @@ define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) { ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_04404567: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_04404567: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_04404567: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_04404567: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_04404567: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_04404567: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 7> ret <8 x i16> %shuffle } @@ -700,17 +812,10 @@ define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_0127XXXX: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_0127XXXX: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_0127XXXX: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %shuffle } @@ -733,17 +838,10 @@ define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_XXXX4563: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_XXXX4563: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_XXXX4563: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 3> ret <8 x i16> %shuffle } @@ -766,17 +864,10 @@ define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_4563XXXX: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_4563XXXX: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_4563XXXX: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %shuffle } @@ -799,17 +890,10 @@ define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_01274563: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_01274563: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,2] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_01274563: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 3> ret <8 x i16> %shuffle } @@ -832,17 +916,10 @@ define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_45630127: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_45630127: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,3,1] -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_45630127: +; AVX: # %bb.0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15] +; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 7> ret <8 x i16> %shuffle } @@ -980,12 +1057,38 @@ define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) { ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_109832ba: -; AVX: # %bb.0: -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7] -; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_109832ba: +; AVX1: # %bb.0: +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_109832ba: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_109832ba: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_109832ba: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_109832ba: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10> ret <8 x i16> %shuffle } @@ -1028,13 +1131,43 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) { ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_0213cedf: -; AVX: # %bb.0: -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_0213cedf: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_0213cedf: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_0213cedf: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_0213cedf: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15] +; AVX512VL-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7] +; AVX512VL-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15> ret <8 x i16> %shuffle } @@ -1064,12 +1197,38 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) { ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: shuffle_v8i16_443aXXXX: -; AVX: # %bb.0: -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v8i16_443aXXXX: +; AVX1: # %bb.0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_443aXXXX: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_443aXXXX: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_443aXXXX: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_443aXXXX: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef> ret <8 x i16> %shuffle } @@ -1336,13 +1495,35 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) { ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i16_XXX1X579: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] -; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i16_XXX1X579: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_XXX1X579: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXX1X579: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; AVX512VL-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i16_XXX1X579: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15] +; AVX512VL-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9> ret <8 x i16> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll index 11f25a2d687d..cbd1b83a4eb2 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: @@ -156,15 +158,24 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: @@ -214,12 +225,19 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: @@ -241,12 +259,19 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: @@ -373,11 +398,27 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> ret <16 x i16> %shuffle } @@ -393,11 +434,27 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> ret <16 x i16> %shuffle } @@ -413,11 +470,27 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12> ret <16 x i16> %shuffle } @@ -433,11 +506,27 @@ define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15> ret <16 x i16> %shuffle } @@ -453,11 +542,27 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14> ret <16 x i16> %shuffle } @@ -473,11 +578,27 @@ define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15] -; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15] +; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15> ret <16 x i16> %shuffle } @@ -789,13 +910,20 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: ; AVX512VL: # %bb.0: @@ -850,12 +978,18 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: -; AVX2: # %bb.0: -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,14,15,12,13,10,11,8,9,22,23,20,21,18,19,16,17,30,31,28,29,26,27,24,25] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: ; AVX512VL: # %bb.0: @@ -880,13 +1014,20 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,6,7,4,5,2,3,0,1,16,17,18,19,20,21,22,23,22,23,20,21,18,19,16,17] +; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: ; AVX512VL: # %bb.0: @@ -1349,12 +1490,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20: -; AVX2: # %bb.0: -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20: ; AVX512VL: # %bb.0: @@ -1376,12 +1523,18 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20: -; AVX2: # %bb.0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20: ; AVX512VL: # %bb.0: @@ -1404,12 +1557,18 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28: -; AVX2: # %bb.0: -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28: ; AVX512VL: # %bb.0: @@ -1431,12 +1590,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28: -; AVX2: # %bb.0: -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28: ; AVX512VL: # %bb.0: @@ -1877,15 +2042,24 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: ; AVX512VL: # %bb.0: @@ -1909,15 +2083,24 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT: vpsllq $48, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: ; AVX512VL: # %bb.0: @@ -1995,14 +2178,22 @@ define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11,22,23,18,19,20,21,16,17,28,29,30,31,24,25,26,27] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: ; AVX512VL: # %bb.0: @@ -2081,14 +2272,22 @@ define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255> -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,2,4,5,6,7,10,11,8,10,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255> +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,2,4,5,6,7,10,11,8,10,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255> +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11,20,21,22,23,16,17,20,21,28,29,30,31,24,25,26,27] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: ; AVX512VL: # %bb.0: @@ -2110,14 +2309,22 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15,20,21,22,23,16,17,18,19,28,29,30,31,24,25,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: ; AVX512VL: # %bb.0: @@ -2396,12 +2603,19 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: ; AVX512VL: # %bb.0: @@ -2540,14 +2754,22 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,0,4,5,6,7,8,10,10,8,12,13,14,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,0,4,5,6,7,8,10,10,8,12,13,14,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15,16,17,24,25,24,25,16,17,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: ; AVX512VL: # %bb.0: @@ -2655,17 +2877,10 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_u ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15,16,17,18,19,20,21,30,31,20,21,30,31,28,29,30,31] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,7,6,7,8,9,10,11,12,15,14,15] -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15,16,17,18,19,20,21,30,31,20,21,30,31,28,29,30,31] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 10, i32 15, i32 undef, i32 undef, i32 undef, i32 undef> ret <16 x i16> %shuffle } @@ -2705,17 +2920,10 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3,24,25,26,27,28,29,22,23,24,25,26,27,16,17,18,19] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,2,0,7,5,6,4] -; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,3,4,5,6,7,8,11,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3,24,25,26,27,28,29,22,23,24,25,26,27,16,17,18,19] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11, i32 undef, i32 undef, i32 undef, i32 undef> ret <16 x i16> %shuffle } @@ -2803,12 +3011,19 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: ; AVX512VL: # %bb.0: @@ -2957,16 +3172,25 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,2,3,8,9,12,13,12,13,14,15,16,17,16,17,20,21,18,19,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX512VL: # %bb.0: @@ -2992,16 +3216,26 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u> -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX512VL: # %bb.0: @@ -3123,15 +3357,24 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_3 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: -; AVX2: # %bb.0: -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: ; AVX512VL: # %bb.0: @@ -3156,12 +3399,18 @@ define <16 x i16> @shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_u ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: -; AVX2: # %bb.0: -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,2,4,5,6,7,8,8,11,10,12,13,14,15] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,2,4,5,6,7,8,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15,24,25,24,25,22,23,20,21,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: ; AVX512VL: # %bb.0: @@ -3363,14 +3612,22 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15,16,17,18,19,20,21,18,19,24,25,26,27,30,31,30,31] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; AVX512VL: # %bb.0: @@ -3783,12 +4040,31 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; AVX512VL-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> ret <16 x i16> %shuffle } @@ -3821,11 +4097,33 @@ define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16> } define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) { -; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: -; ALL: # %bb.0: -; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; ALL-NEXT: retq +; AVX1-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ret <16 x i16> %shuffle } @@ -3838,12 +4136,31 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ret <16 x i16> %shuffle } @@ -3906,9 +4223,8 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2 ; ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25] -; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,10,26,11,27,0,16,1,17,8,24,9,25] +; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25> %2 = bitcast <16 x i16> %1 to <4 x i64> @@ -4006,21 +4322,36 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: PR24935: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0> -; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: PR24935: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: PR24935: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,2,3,6,7,10,11,10,11,12,13,14,15,16,17,18,19,18,19,22,23,26,27,26,27,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: PR24935: ; AVX512VL: # %bb.0: diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll index 01c7fc466eb8..51ef3a18438f 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1,7 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLBW --check-prefix=AVX512VLBW-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLBW --check-prefix=AVX512VLBW-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLVBMI --check-prefix=AVX512VLVBMI-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLVBMI --check-prefix=AVX512VLVBMI-FAST define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: @@ -312,17 +316,23 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_ ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpbroadcastb %xmm0, %xmm0 -; AVX512VL-NEXT: movl $32767, %eax # imm = 0x7FFF -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512VLBW-NEXT: movl $32767, %eax # imm = 0x7FFF +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -346,14 +356,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: movl $1, %eax -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-NEXT: movl $1, %eax +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -377,14 +393,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: movw $1, %ax -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-NEXT: movw $1, %ax +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -408,14 +430,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: movw $1, %ax -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-NEXT: movw $1, %ax +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -431,12 +459,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-FAST-NEXT: retq +; +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -452,12 +507,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-FAST-NEXT: retq +; +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -473,12 +555,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-FAST-NEXT: retq +; +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -494,12 +603,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-FAST-NEXT: retq +; +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX512VLBW-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -522,12 +658,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -550,12 +698,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -578,12 +738,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -606,12 +778,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -634,12 +818,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -662,12 +858,24 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -690,12 +898,24 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -722,14 +942,29 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT: movl $15, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT: movl $15, %eax +; AVX512VLBW-SLOW-NEXT: vmovd %eax, %xmm1 +; AVX512VLBW-SLOW-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT: movl $15, %eax +; AVX512VLBW-FAST-NEXT: vmovd %eax, %xmm1 +; AVX512VLBW-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: movl $31, %eax +; AVX512VLVBMI-NEXT: vmovd %eax, %xmm1 +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <32 x i8> %shuffle } @@ -1092,25 +1327,49 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VLBW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-SLOW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VLBW-SLOW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1} +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX512VLBW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-FAST-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VLBW-FAST-NEXT: kmovd %eax, %k1 +; AVX512VLBW-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1} +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,0,32,0,32,0,32,0,32,0,32,0,32,0,32,16,48,16,48,16,48,16,48,16,48,16,48,16,48,16,48] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48> ret <32 x i8> %shuffle } @@ -1129,12 +1388,26 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2OR512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,40,41,42,43,44,45,46,47,16,16,16,16,16,16,16,16,56,57,58,59,60,61,62,63] +; AVX512VLVBMI-NEXT: vpermi2b %ymm0, %ymm1, %ymm2 +; AVX512VLVBMI-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> ret <32 x i8> %shuffle } @@ -1155,11 +1428,24 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,47,46,45,44,43,42,41,40,23,22,21,20,19,18,17,16,63,62,61,60,59,58,57,56] +; AVX512VLVBMI-NEXT: vpermi2b %ymm0, %ymm1, %ymm2 +; AVX512VLVBMI-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24> ret <32 x i8> %shuffle } @@ -1177,12 +1463,26 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] +; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,39,38,37,36,35,34,33,32,23,22,21,20,19,18,17,16,55,54,53,52,51,50,49,48] +; AVX512VLVBMI-NEXT: vpermi2b %ymm0, %ymm1, %ymm2 +; AVX512VLVBMI-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16> ret <32 x i8> %shuffle } @@ -1350,13 +1650,19 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_ ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u] -; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31] -; AVX512VL-NEXT: retq +; AVX512VLBW-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u] +; AVX512VLBW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> ret <32 x i8> %shuffle } @@ -1379,13 +1685,19 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_ ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u] -; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23] -; AVX512VL-NEXT: retq +; AVX512VLBW-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u] +; AVX512VLBW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47,16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> ret <32 x i8> %shuffle } @@ -1634,22 +1946,29 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u] -; AVX512VL-NEXT: movl $-222248896, %eax # imm = 0xF2C0C040 -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] -; AVX512VL-NEXT: movl $134948620, %eax # imm = 0x80B270C -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 -; AVX512VL-NEXT: retq +; AVX512VLBW-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u] +; AVX512VLBW-NEXT: movl $-222248896, %eax # imm = 0xF2C0C040 +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] +; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] +; AVX512VLBW-NEXT: movl $134948620, %eax # imm = 0x80B270C +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [10,13,44,45,3,3,28,8,49,54,61,12,1,44,16,19,52,51,20,51,17,22,5,0,16,10,27,39,4,2,4,7] +; AVX512VLVBMI-NEXT: vpermi2b %ymm0, %ymm1, %ymm2 +; AVX512VLVBMI-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 42, i32 45, i32 12, i32 13, i32 35, i32 35, i32 60, i32 40, i32 17, i32 22, i32 29, i32 44, i32 33, i32 12, i32 48, i32 51, i32 20, i32 19, i32 52, i32 19, i32 49, i32 54, i32 37, i32 32, i32 48, i32 42, i32 59, i32 7, i32 36, i32 34, i32 36, i32 39> ret <32 x i8> %shuffle } @@ -1663,11 +1982,23 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX2: # %bb.0: +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,32,32,32,32,32,32,32,32,40,40,40,40,40,40,40,40] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40> ret <32 x i8> %shuffle } @@ -1682,11 +2013,23 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24,32,32,32,32,32,32,32,32,40,40,40,40,40,40,40,40] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40> ret <32 x i8> %shuffle } @@ -1702,11 +2045,23 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX2: # %bb.0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24,48,48,48,48,48,48,48,48,56,56,56,56,56,56,56,56] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56> ret <32 x i8> %shuffle } @@ -1721,11 +2076,23 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,48,48,48,48,48,48,48,48,56,56,56,56,56,56,56,56] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56> ret <32 x i8> %shuffle } @@ -1738,12 +2105,25 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512VLBW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47> ret <32 x i8> %shuffle } @@ -1961,18 +2341,26 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-NEXT: movl $286331153, %eax # imm = 0x11111111 -; AVX512VL-NEXT: kmovd %eax, %k1 -; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm0 {%k1} {z} -; AVX512VL-NEXT: retq +; AVX512VLBW-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VLBW-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VLBW-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VLBW-NEXT: movl $286331153, %eax # imm = 0x11111111 +; AVX512VLBW-NEXT: kmovd %eax, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm0 {%k1} {z} +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [56,1,2,3,57,5,6,7,58,9,10,11,59,13,14,15,60,17,18,19,61,21,22,23,62,25,26,27,63,29,30,31] +; AVX512VLVBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLVBMI-NEXT: vpermt2b %ymm0, %ymm2, %ymm1 +; AVX512VLVBMI-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 56, i32 1, i32 2, i32 3, i32 57, i32 5, i32 6, i32 7, i32 58, i32 9, i32 10, i32 11, i32 59, i32 13, i32 14, i32 15, i32 60, i32 17, i32 18, i32 19, i32 61, i32 21, i32 22, i32 23, i32 62, i32 25, i32 26, i32 27, i32 63, i32 29, i32 30, i32 31> ret <32 x i8> %shuffle } @@ -2159,19 +2547,11 @@ define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10> ret <32 x i8> %shuffle } @@ -2203,37 +2583,21 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_ ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15] +; AVX2OR512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ret <32 x i8> %shuffle } define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) { -; AVX1OR2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512VL-NEXT: retq +; ALL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; ALL: # %bb.0: +; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; ALL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ret <32 x i8> %shuffle } @@ -2245,19 +2609,11 @@ define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_ ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512VL-NEXT: retq +; AVX2OR512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> ret <32 x i8> %shuffle } @@ -2276,13 +2632,27 @@ define <32 x i8> @shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX2OR512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> %3 = bitcast <16 x i16> %1 to <32 x i8> @@ -2310,11 +2680,29 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) { ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: PR28136: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: PR28136: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: PR28136: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-SLOW-LABEL: PR28136: +; AVX512VLVBMI-SLOW: # %bb.0: +; AVX512VLVBMI-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VLVBMI-SLOW-NEXT: retq +; +; AVX512VLVBMI-FAST-LABEL: PR28136: +; AVX512VLVBMI-FAST: # %bb.0: +; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,16,48,17,49,18,50,19,51,4,36,5,37,6,38,7,39,20,52,21,53,22,54,23,55] +; AVX512VLVBMI-FAST-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-FAST-NEXT: retq %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50,i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> %2 = bitcast <32 x i8> %1 to <4 x i64> %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3> diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index 41dcb5032ee2..ebef762787d9 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) { ; AVX1-LABEL: shuffle_v4f64_0000: @@ -546,19 +548,29 @@ define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) { ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4f64_0z3z: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] -; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4f64_0z3z: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; AVX512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v4f64_0z3z: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] +; AVX2-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v4f64_0z3z: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4f64_0z3z: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] +; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v4f64_0z3z: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 0, i32 4, i32 3, i32 4> ret <4 x double> %shuffle } @@ -574,19 +586,29 @@ define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4f64_1z2z: -; AVX2: # %bb.0: -; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4f64_1z2z: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] -; AVX512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v4f64_1z2z: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v4f64_1z2z: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4f64_1z2z: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v4f64_1z2z: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-FAST-NEXT: retq %1 = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4> ret <4 x double> %1 } @@ -812,11 +834,17 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) { ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v4i64_0124: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v4i64_0124: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_0124: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,4] +; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4> ret <4 x i64> %shuffle } @@ -863,12 +891,19 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) { ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v4i64_0412: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v4i64_0412: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_0412: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2] +; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2> ret <4 x i64> %shuffle } @@ -889,11 +924,17 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) { ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v4i64_4012: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2] -; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v4i64_4012: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2] +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_4012: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2> ret <4 x i64> %shuffle } @@ -924,9 +965,8 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0451: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,5,1] +; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1> ret <4 x i64> %shuffle @@ -958,9 +998,8 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_4015: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,0,1,5] +; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5> ret <4 x i64> %shuffle @@ -980,11 +1019,17 @@ define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) { ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v4i64_2u35: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1] -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v4i64_2u35: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_2u35: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,3,5] +; AVX512VL-FAST-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5> ret <4 x i64> %shuffle } @@ -1008,9 +1053,8 @@ define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_1251: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,2,1] -; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,5,1] +; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1> ret <4 x i64> %shuffle @@ -1121,9 +1165,8 @@ define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) { ; ; AVX512VL-LABEL: shuffle_v4i64_0415: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,1,5] +; AVX512VL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5> ret <4 x i64> %shuffle @@ -1564,19 +1607,29 @@ define <4 x i64> @shuffle_v4i64_z0z3(<4 x i64> %a, <4 x i64> %b) { ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i64_z0z3: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3] -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i64_z0z3: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3] -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v4i64_z0z3: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3] +; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v4i64_z0z3: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4i64_z0z3: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3] +; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_z0z3: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31] +; AVX512VL-FAST-NEXT: retq %1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 4, i32 0, i32 4, i32 3> ret <4 x i64> %1 } @@ -1592,19 +1645,29 @@ define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) { ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v4i64_1z2z: -; AVX2: # %bb.0: -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] -; AVX2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v4i64_1z2z: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0] -; AVX512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v4i64_1z2z: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v4i64_1z2z: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4i64_1z2z: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_1z2z: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-FAST-NEXT: retq %1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4> ret <4 x i64> %1 } diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll index 44d0217f5295..1b6df7dd2d25 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00000000: @@ -342,12 +344,26 @@ define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_09ab1def: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8f32_09ab1def: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_09ab1def: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_09ab1def: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [8,1,2,3,10,5,6,7] +; AVX512VL-FAST-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15> ret <8 x float> %shuffle } @@ -651,14 +667,23 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8f32_c348cda0: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u> -; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8f32_c348cda0: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u> +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_c348cda0: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,3,4,7,4,7,2,0] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u> +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_c348cda0: ; AVX512VL: # %bb.0: @@ -681,14 +706,23 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8f32_f511235a: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8f32_f511235a: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_f511235a: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [7,6,2,3,7,6,3,2] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [5,5,1,1,2,3,5,5] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8f32_f511235a: ; AVX512VL: # %bb.0: @@ -722,11 +756,29 @@ define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_76547654: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8f32_76547654: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_76547654: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_76547654: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_76547654: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4> ret <8 x float> %shuffle } @@ -738,11 +790,29 @@ define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) { ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8f32_76543210: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8f32_76543210: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_76543210: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_76543210: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_76543210: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ret <8 x float> %shuffle } @@ -798,11 +868,23 @@ define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) { ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: PR21138: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: PR21138: +; AVX2: # %bb.0: +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: PR21138: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: PR21138: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] +; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x float> %truc, <8 x float> %tchose, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> ret <8 x float> %shuffle } @@ -1264,12 +1346,26 @@ define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_09ab1def: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2OR512VL-NEXT: retq +; AVX2-LABEL: shuffle_v8i32_09ab1def: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_09ab1def: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_09ab1def: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,2,3,10,5,6,7] +; AVX512VL-FAST-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15> ret <8 x i32> %shuffle } @@ -1696,13 +1792,21 @@ define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] ; AVX1-NEXT: retq ; -; AVX2-LABEL: shuffle_v8i32_6caa87e5: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i32_6caa87e5: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i32_6caa87e5: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [4,4,2,2,0,0,6,6] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_6caa87e5: ; AVX512VL: # %bb.0: @@ -1737,11 +1841,29 @@ define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_76547654: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i32_76547654: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i32_76547654: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_76547654: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_76547654: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4> ret <8 x i32> %shuffle } @@ -1753,11 +1875,29 @@ define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) { ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v8i32_76543210: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v8i32_76543210: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v8i32_76543210: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_76543210: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_76543210: +; AVX512VL-FAST: # %bb.0: +; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> ret <8 x i32> %shuffle } diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll index 3e49957bf85e..d4fb0fd52a79 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -186,8 +186,7 @@ define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19 ; ; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: ; SKX: ## %bb.0: -; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] -; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[2,3,2,3,0,1,0,1,10,11,10,11,8,9,8,9,18,19,18,19,16,17,16,17,26,27,26,27,24,25,24,25,34,35,34,35,32,33,32,33,42,43,42,43,40,41,40,41,50,51,50,51,48,49,48,49,58,59,58,59,56,57,56,57] ; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 5, i32 5, i32 4, i32 4, i32 9, i32 9, i32 8, i32 8, i32 13, i32 13, i32 12, i32 12, i32 17, i32 17, i32 16, i32 16, i32 21, i32 21, i32 20, i32 20, i32 25, i32 25, i32 24, i32 24, i32 29, i32 29, i32 28, i32 28> ret <32 x i16> %c @@ -354,3 +353,18 @@ define <8 x i16> @pr32967(<32 x i16> %v) { %shuffle = shufflevector <32 x i16> %v, <32 x i16> undef, <8 x i32> <i32 1,i32 5,i32 9,i32 13,i32 17,i32 21,i32 25,i32 29> ret <8 x i16> %shuffle } + +define <32 x i16> @shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz(<32 x i16> %a) { +; KNL-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz: +; KNL: ## %bb.0: +; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[18,19],zero,zero +; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15],zero,zero,ymm1[10,11],zero,zero,ymm1[6,7],zero,zero,ymm1[2,3],zero,zero,ymm1[30,31],zero,zero,ymm1[26,27],zero,zero,ymm1[22,23],zero,zero,ymm1[20,21],zero,zero +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz: +; SKX: ## %bb.0: +; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[14,15],zero,zero,zmm0[10,11],zero,zero,zmm0[6,7],zero,zero,zmm0[2,3],zero,zero,zmm0[30,31],zero,zero,zmm0[26,27],zero,zero,zmm0[22,23],zero,zero,zmm0[18,19],zero,zero,zmm0[46,47],zero,zero,zmm0[42,43],zero,zero,zmm0[38,39],zero,zero,zmm0[34,35],zero,zero,zmm0[62,63],zero,zero,zmm0[58,59],zero,zero,zmm0[54,55],zero,zero,zmm0[52,53],zero,zero +; SKX-NEXT: retq + %shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> <i32 39, i32 0, i32 37, i32 0, i32 35, i32 0, i32 33, i32 0, i32 47, i32 0, i32 45, i32 0, i32 43, i32 0, i32 41, i32 0, i32 55, i32 0, i32 53, i32 0, i32 51, i32 0, i32 49, i32 0, i32 63, i32 0, i32 61, i32 0, i32 59, i32 0, i32 58, i32 0> + ret <32 x i16> %shuffle +} diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll index 9c92ca756ebd..f3433ce834cd 100644 --- a/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/test/CodeGen/X86/vector-shuffle-v1.ll @@ -48,9 +48,8 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z} -; AVX512VL-NEXT: movb $1, %al -; AVX512VL-NEXT: kmovw %eax, %k1 -; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm2 {%k1} {z} +; AVX512VL-NEXT: movq $-1, %rax +; AVX512VL-NEXT: vmovq %rax, %xmm2 ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX512VL-NEXT: vpsllq $63, %xmm1, %xmm1 ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 @@ -61,9 +60,8 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0 -; VL_BW_DQ-NEXT: movb $1, %al -; VL_BW_DQ-NEXT: kmovd %eax, %k1 -; VL_BW_DQ-NEXT: vpmovm2q %k1, %xmm0 +; VL_BW_DQ-NEXT: movq $-1, %rax +; VL_BW_DQ-NEXT: vmovq %rax, %xmm0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm1 ; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 @@ -123,12 +121,12 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] -; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0] +; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper @@ -137,10 +135,10 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0: ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0 -; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] -; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0 -; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 +; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0] +; VL_BW_DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq @@ -250,12 +248,12 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) { ; AVX512VL-LABEL: shuf8i1_u_2_u_u_2_u_2_u: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: kmovw %edi, %k1 -; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vpbroadcastq %xmm0, %zmm0 -; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 ; AVX512VL-NEXT: vzeroupper @@ -264,10 +262,10 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) { ; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u: ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: kmovd %edi, %k0 -; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 -; VL_BW_DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0 -; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 +; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %ymm0 +; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0 ; VL_BW_DQ-NEXT: vzeroupper ; VL_BW_DQ-NEXT: retq @@ -294,12 +292,14 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: kmovw %edi, %k1 -; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u> -; AVX512VL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0 -; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7] +; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def %al killed %al killed %eax ; AVX512VL-NEXT: vzeroupper @@ -308,11 +308,12 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u: ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: kmovd %edi, %k0 -; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 +; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] +; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u> -; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 +; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7] +; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, %eax ; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax ; VL_BW_DQ-NEXT: vzeroupper @@ -339,10 +340,11 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) { ; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: kmovw %edi, %k1 -; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512VL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1] -; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def %al killed %al killed %eax ; AVX512VL-NEXT: vzeroupper @@ -351,9 +353,9 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) { ; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u: ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: kmovd %edi, %k0 -; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 -; VL_BW_DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1] -; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 +; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, %eax ; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax ; VL_BW_DQ-NEXT: vzeroupper @@ -382,12 +384,13 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: kmovw %edi, %k1 -; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] -; AVX512VL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0 -; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0] +; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0 +; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def %al killed %al killed %eax ; AVX512VL-NEXT: vzeroupper @@ -396,11 +399,11 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0: ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: kmovd %edi, %k0 -; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] -; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 +; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0] +; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, %eax ; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax ; VL_BW_DQ-NEXT: vzeroupper @@ -429,12 +432,13 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: kmovw %edi, %k1 -; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0 -; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7] +; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def %al killed %al killed %eax ; AVX512VL-NEXT: vzeroupper @@ -443,11 +447,11 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0: ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: kmovd %edi, %k0 -; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] -; VL_BW_DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 +; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2] +; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7] +; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, %eax ; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax ; VL_BW_DQ-NEXT: vzeroupper @@ -462,12 +466,10 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: movb $51, %al -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0] +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax @@ -478,14 +480,12 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: kmovw %edi, %k1 -; AVX512VL-NEXT: movb $51, %al -; AVX512VL-NEXT: kmovw %eax, %k2 -; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512VL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1] -; AVX512VL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0 -; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7] +; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def %al killed %al killed %eax ; AVX512VL-NEXT: vzeroupper @@ -494,11 +494,10 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1: ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: kmovd %edi, %k0 -; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1] -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0] -; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 +; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7] +; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, %eax ; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax ; VL_BW_DQ-NEXT: vzeroupper @@ -528,15 +527,15 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0 -; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] -; AVX512VL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512VL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0 -; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0 +; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] +; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def %al killed %al killed %eax ; AVX512VL-NEXT: vzeroupper @@ -546,11 +545,11 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; VL_BW_DQ: # %bb.0: ; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0 -; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0 -; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] -; VL_BW_DQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 -; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0 +; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 +; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; VL_BW_DQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, %eax ; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax ; VL_BW_DQ-NEXT: vzeroupper diff --git a/test/CodeGen/X86/vector-shuffle-variable-128.ll b/test/CodeGen/X86/vector-shuffle-variable-128.ll index 0367737dda60..4de24d5fec4d 100644 --- a/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -414,63 +414,62 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: movzbl (%rax,%r10), %eax +; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE2-NEXT: movd %eax, %xmm8 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: movzbl (%rax,%r10), %eax +; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE2-NEXT: movd %eax, %xmm15 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: movzbl (%rax,%r10), %eax +; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE2-NEXT: movd %eax, %xmm9 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: movzbl (%rax,%r10), %eax +; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: movzbl (%rax,%r10), %eax +; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE2-NEXT: movd %eax, %xmm10 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: movzbl (%rax,%r10), %eax +; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE2-NEXT: movd %eax, %xmm7 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: movzbl (%rax,%r10), %eax +; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE2-NEXT: movd %eax, %xmm11 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: movzbl (%rax,%r10), %eax +; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE2-NEXT: movd %eax, %xmm6 -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: movzbl (%rax,%r10), %eax +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %eax ; SSE2-NEXT: movd %eax, %xmm12 -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: movzbl (%rax,%r10), %eax +; SSE2-NEXT: andl $15, %edx +; SSE2-NEXT: movzbl -24(%rsp,%rdx), %eax ; SSE2-NEXT: movd %eax, %xmm5 -; SSE2-NEXT: andl $15, %r9d -; SSE2-NEXT: movzbl (%r9,%r10), %eax +; SSE2-NEXT: andl $15, %esi +; SSE2-NEXT: movzbl -24(%rsp,%rsi), %eax ; SSE2-NEXT: movd %eax, %xmm13 -; SSE2-NEXT: andl $15, %r8d -; SSE2-NEXT: movzbl (%r8,%r10), %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: andl $15, %ecx -; SSE2-NEXT: movzbl (%rcx,%r10), %eax +; SSE2-NEXT: andl $15, %edi +; SSE2-NEXT: movzbl -24(%rsp,%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: andl $15, %r9d +; SSE2-NEXT: movzbl -24(%rsp,%r9), %eax ; SSE2-NEXT: movd %eax, %xmm14 -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%r10), %eax +; SSE2-NEXT: andl $15, %r8d +; SSE2-NEXT: movzbl -24(%rsp,%r8), %eax ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: andl $15, %esi -; SSE2-NEXT: movzbl (%rsi,%r10), %eax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: andl $15, %edi -; SSE2-NEXT: movzbl (%rdi,%r10), %eax -; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] @@ -479,12 +478,12 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE2-NEXT: retq ; @@ -499,63 +498,62 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %r10 -; SSSE3-NEXT: movzbl (%rax,%r10), %eax +; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm8 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r10), %eax +; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm15 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r10), %eax +; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm9 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r10), %eax +; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r10), %eax +; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm10 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r10), %eax +; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm7 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r10), %eax +; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm11 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r10), %eax +; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm6 -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r10), %eax +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %eax ; SSSE3-NEXT: movd %eax, %xmm12 -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r10), %eax +; SSSE3-NEXT: andl $15, %edx +; SSSE3-NEXT: movzbl -24(%rsp,%rdx), %eax ; SSSE3-NEXT: movd %eax, %xmm5 -; SSSE3-NEXT: andl $15, %r9d -; SSSE3-NEXT: movzbl (%r9,%r10), %eax +; SSSE3-NEXT: andl $15, %esi +; SSSE3-NEXT: movzbl -24(%rsp,%rsi), %eax ; SSSE3-NEXT: movd %eax, %xmm13 -; SSSE3-NEXT: andl $15, %r8d -; SSSE3-NEXT: movzbl (%r8,%r10), %eax -; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%r10), %eax +; SSSE3-NEXT: andl $15, %edi +; SSSE3-NEXT: movzbl -24(%rsp,%rdi), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: andl $15, %r9d +; SSSE3-NEXT: movzbl -24(%rsp,%r9), %eax ; SSSE3-NEXT: movd %eax, %xmm14 -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%r10), %eax +; SSSE3-NEXT: andl $15, %r8d +; SSSE3-NEXT: movzbl -24(%rsp,%r8), %eax ; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: andl $15, %esi -; SSSE3-NEXT: movzbl (%rsi,%r10), %eax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: andl $15, %edi -; SSSE3-NEXT: movzbl (%rdi,%r10), %eax -; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] @@ -564,12 +562,12 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSSE3-NEXT: retq ; @@ -583,49 +581,48 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; SSE41-NEXT: # kill: def %edi killed %edi def %rdi ; SSE41-NEXT: andl $15, %edi ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; SSE41-NEXT: movzbl (%rdi,%rax), %edi -; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: movzbl -24(%rsp,%rdi), %eax +; SSE41-NEXT: movd %eax, %xmm0 ; SSE41-NEXT: andl $15, %esi -; SSE41-NEXT: pinsrb $1, (%rsi,%rax), %xmm0 +; SSE41-NEXT: pinsrb $1, -24(%rsp,%rsi), %xmm0 ; SSE41-NEXT: andl $15, %edx -; SSE41-NEXT: pinsrb $2, (%rdx,%rax), %xmm0 +; SSE41-NEXT: pinsrb $2, -24(%rsp,%rdx), %xmm0 ; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $3, (%rcx,%rax), %xmm0 +; SSE41-NEXT: pinsrb $3, -24(%rsp,%rcx), %xmm0 ; SSE41-NEXT: andl $15, %r8d -; SSE41-NEXT: pinsrb $4, (%r8,%rax), %xmm0 +; SSE41-NEXT: pinsrb $4, -24(%rsp,%r8), %xmm0 ; SSE41-NEXT: andl $15, %r9d -; SSE41-NEXT: pinsrb $5, (%r9,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $6, (%rcx,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $7, (%rcx,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $8, (%rcx,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $9, (%rcx,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $10, (%rcx,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $11, (%rcx,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $12, (%rcx,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $13, (%rcx,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $14, (%rcx,%rax), %xmm0 -; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; SSE41-NEXT: andl $15, %ecx -; SSE41-NEXT: pinsrb $15, (%rcx,%rax), %xmm0 +; SSE41-NEXT: pinsrb $5, -24(%rsp,%r9), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: pinsrb $6, -24(%rsp,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: pinsrb $7, -24(%rsp,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: pinsrb $8, -24(%rsp,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: pinsrb $9, -24(%rsp,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: pinsrb $10, -24(%rsp,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: pinsrb $11, -24(%rsp,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: pinsrb $12, -24(%rsp,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: pinsrb $13, -24(%rsp,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: pinsrb $14, -24(%rsp,%rax), %xmm0 +; SSE41-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: andl $15, %eax +; SSE41-NEXT: pinsrb $15, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: @@ -638,49 +635,48 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 % ; AVX-NEXT: # kill: def %edi killed %edi def %rdi ; AVX-NEXT: andl $15, %edi ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movzbl (%rdi,%rax), %edi -; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: movzbl -24(%rsp,%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: andl $15, %esi -; AVX-NEXT: vpinsrb $1, (%rsi,%rax), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $1, -24(%rsp,%rsi), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %edx -; AVX-NEXT: vpinsrb $2, (%rdx,%rax), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $3, (%rcx,%rax), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $3, -24(%rsp,%rcx), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r8d -; AVX-NEXT: vpinsrb $4, (%r8,%rax), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0 ; AVX-NEXT: andl $15, %r9d -; AVX-NEXT: vpinsrb $5, (%r9,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $6, (%rcx,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $7, (%rcx,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $8, (%rcx,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $9, (%rcx,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $10, (%rcx,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $11, (%rcx,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $12, (%rcx,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $13, (%rcx,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $14, (%rcx,%rax), %xmm0, %xmm0 -; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; AVX-NEXT: andl $15, %ecx -; AVX-NEXT: vpinsrb $15, (%rcx,%rax), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $5, -24(%rsp,%r9), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 +; AVX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; AVX-NEXT: andl $15, %eax +; AVX-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: retq %x0 = extractelement <16 x i8> %x, i8 %i0 %x1 = extractelement <16 x i8> %x, i8 %i1 @@ -819,69 +815,68 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* ; SSE2: # %bb.0: ; SSE2-NEXT: movzbl (%rdi), %eax ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl 15(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm8 -; SSE2-NEXT: movzbl 14(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm15 -; SSE2-NEXT: movzbl 13(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm9 -; SSE2-NEXT: movzbl 12(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm3 -; SSE2-NEXT: movzbl 11(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm10 -; SSE2-NEXT: movzbl 10(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm7 -; SSE2-NEXT: movzbl 9(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm11 -; SSE2-NEXT: movzbl 8(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm6 -; SSE2-NEXT: movzbl 7(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm12 -; SSE2-NEXT: movzbl 6(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: movzbl 5(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm13 -; SSE2-NEXT: movzbl 4(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: movzbl 3(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm14 -; SSE2-NEXT: movzbl 2(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm1 -; SSE2-NEXT: movzbl 1(%rdi), %edx -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%rcx), %edx -; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: movzbl 15(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm8 +; SSE2-NEXT: movzbl 14(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm15 +; SSE2-NEXT: movzbl 13(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm9 +; SSE2-NEXT: movzbl 12(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: movzbl 11(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm10 +; SSE2-NEXT: movzbl 10(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm7 +; SSE2-NEXT: movzbl 9(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm11 +; SSE2-NEXT: movzbl 8(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm6 +; SSE2-NEXT: movzbl 7(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm12 +; SSE2-NEXT: movzbl 6(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm5 +; SSE2-NEXT: movzbl 5(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm13 +; SSE2-NEXT: movzbl 4(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: movzbl 3(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm14 +; SSE2-NEXT: movzbl 2(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movzbl 1(%rdi), %ecx +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: movzbl (%rax,%rcx), %eax +; SSE2-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] @@ -904,69 +899,68 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* ; SSSE3: # %bb.0: ; SSSE3-NEXT: movzbl (%rdi), %eax ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSSE3-NEXT: movzbl 15(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm8 -; SSSE3-NEXT: movzbl 14(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm15 -; SSSE3-NEXT: movzbl 13(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm9 -; SSSE3-NEXT: movzbl 12(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm3 -; SSSE3-NEXT: movzbl 11(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm10 -; SSSE3-NEXT: movzbl 10(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm7 -; SSSE3-NEXT: movzbl 9(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm11 -; SSSE3-NEXT: movzbl 8(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm6 -; SSSE3-NEXT: movzbl 7(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm12 -; SSSE3-NEXT: movzbl 6(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: movzbl 5(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm13 -; SSSE3-NEXT: movzbl 4(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: movzbl 3(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm14 -; SSSE3-NEXT: movzbl 2(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm1 -; SSSE3-NEXT: movzbl 1(%rdi), %edx -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx -; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: movzbl 15(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm8 +; SSSE3-NEXT: movzbl 14(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm15 +; SSSE3-NEXT: movzbl 13(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm9 +; SSSE3-NEXT: movzbl 12(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: movzbl 11(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm10 +; SSSE3-NEXT: movzbl 10(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm7 +; SSSE3-NEXT: movzbl 9(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm11 +; SSSE3-NEXT: movzbl 8(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm6 +; SSSE3-NEXT: movzbl 7(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm12 +; SSSE3-NEXT: movzbl 6(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm5 +; SSSE3-NEXT: movzbl 5(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm13 +; SSSE3-NEXT: movzbl 4(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: movzbl 3(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm14 +; SSSE3-NEXT: movzbl 2(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movzbl 1(%rdi), %ecx +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl -24(%rsp,%rcx), %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%rcx), %eax +; SSSE3-NEXT: movzbl -24(%rsp,%rax), %eax ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] @@ -990,54 +984,53 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* ; SSE41-NEXT: movzbl (%rdi), %eax ; SSE41-NEXT: andl $15, %eax ; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: movzbl (%rax,%rcx), %eax +; SSE41-NEXT: movzbl -24(%rsp,%rax), %eax ; SSE41-NEXT: movd %eax, %xmm0 ; SSE41-NEXT: movzbl 1(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $1, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $1, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 2(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $2, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $2, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 3(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $3, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $3, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 4(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $4, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $4, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 5(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $5, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $5, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 6(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $6, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $6, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 7(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $7, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $7, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 8(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $8, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $8, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 9(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $9, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $9, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 10(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $10, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $10, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 11(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $11, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $11, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 12(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $12, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $12, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 13(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $13, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $13, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 14(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $14, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $14, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: movzbl 15(%rdi), %eax ; SSE41-NEXT: andl $15, %eax -; SSE41-NEXT: pinsrb $15, (%rax,%rcx), %xmm0 +; SSE41-NEXT: pinsrb $15, -24(%rsp,%rax), %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: @@ -1045,54 +1038,53 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* ; AVX-NEXT: movzbl (%rdi), %eax ; AVX-NEXT: andl $15, %eax ; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movzbl (%rax,%rcx), %eax +; AVX-NEXT: movzbl -24(%rsp,%rax), %eax ; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: movzbl 1(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $1, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 2(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 3(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 4(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 5(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 6(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 7(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 8(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 9(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 10(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 11(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 12(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 13(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 14(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: movzbl 15(%rdi), %eax ; AVX-NEXT: andl $15, %eax -; AVX-NEXT: vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0 +; AVX-NEXT: vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0 ; AVX-NEXT: retq %p0 = getelementptr inbounds i8, i8* %i, i64 0 %p1 = getelementptr inbounds i8, i8* %i, i64 1 diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index d25117ca715c..fd4c30fb327b 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ b/test/CodeGen/X86/vector-trunc.ll @@ -1504,12 +1504,9 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) { ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 ; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -1531,12 +1528,9 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) { ; AVX512BWVL: # %bb.0: # %entry ; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 ; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -1647,43 +1641,13 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) { ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: retq ; -; AVX512F-LABEL: trunc2x4i32_8i16: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc2x4i32_8i16: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc2x4i32_8i16: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc2x4i32_8i16: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: trunc2x4i32_8i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: retq entry: %0 = trunc <4 x i32> %a to <4 x i16> %1 = trunc <4 x i32> %b to <4 x i16> diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll index 94eadd8c1aaf..3ea65573fc70 100644 --- a/test/CodeGen/X86/vector-zext.ll +++ b/test/CodeGen/X86/vector-zext.ll @@ -3,7 +3,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW @@ -1911,11 +1912,28 @@ define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable ; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; SSE41-NEXT: retq ; -; AVX-LABEL: shuf_zext_8i16_to_4i32_offset1: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: retq +; AVX1-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero +; AVX2-FAST-NEXT: retq +; +; AVX512-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: retq entry: %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8> %Z = bitcast <8 x i16> %B to <4 x i32> diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll index 985f6a861b93..3d4355e5f39c 100644 --- a/test/CodeGen/X86/vselect.ll +++ b/test/CodeGen/X86/vselect.ll @@ -182,7 +182,7 @@ define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) { ; SSE2-LABEL: test11: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,0,0,65535,65535,0] ; SSE2-NEXT: andps %xmm2, %xmm0 ; SSE2-NEXT: andnps %xmm1, %xmm2 ; SSE2-NEXT: orps %xmm2, %xmm0 @@ -190,12 +190,12 @@ define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) { ; ; SSE41-LABEL: test11: ; SSE41: # %bb.0: -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] ; SSE41-NEXT: retq ; ; AVX-LABEL: test11: ; AVX: # %bb.0: -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7] ; AVX-NEXT: retq %1 = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b ret <8 x i16> %1 diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s b/test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s index 9158c60fbeb5..fb095e3ebc25 100644 --- a/test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s +++ b/test/DebugInfo/X86/dwarfdump-str-offsets-dwp.s @@ -2,9 +2,10 @@ # RUN: llvm-dwarfdump -v %t.o | FileCheck %s # Test object to verify that dwarfdump handles dwp files with DWARF v5 string -# offset tables. We have 2 CUs and 2 TUs, where it is assumed that +# offset tables. We have 3 CUs and 2 TUs, where it is assumed that # CU1 and TU1 came from one object file, CU2 and TU2 from a second object -# file. +# file, and CU3 from a third object file that was compiled with +# -gdwarf-4. # .section .debug_str.dwo,"MS",@progbits,1 str_producer: @@ -25,56 +26,50 @@ str_TU2: .asciz "Type_Unit_2" str_TU2_type: .asciz "MyStruct_2" +str_CU3: + .asciz "Compile_Unit_3" +str_CU3_dir: + .asciz "/home/test/CU3" .section .debug_str_offsets.dwo,"",@progbits # Object files 1's portion of the .debug_str_offsets.dwo section. -.debug_str_offsets_object_file1: - -# CU1's contribution (from object file 1) -.debug_str_offsets_start_CU1: - .long .debug_str_offsets_end_CU1-.debug_str_offsets_base_CU1 +# CU1 and TU1 share a contribution to the string offsets table. +.debug_str_offsets_object_file1_start: + .long .debug_str_offsets_object_file1_end-.debug_str_offsets_base_1 .short 5 # DWARF version .short 0 # Padding -.debug_str_offsets_base_CU1: +.debug_str_offsets_base_1: .long str_producer-.debug_str.dwo .long str_CU1-.debug_str.dwo .long str_CU1_dir-.debug_str.dwo -.debug_str_offsets_end_CU1: - -# TU1's contribution (from object file 1) -.debug_str_offsets_start_TU1: - .long .debug_str_offsets_end_TU1-.debug_str_offsets_base_TU1 - .short 5 # DWARF version - .short 0 # Padding -.debug_str_offsets_base_TU1: .long str_TU1-.debug_str.dwo .long str_TU1_type-.debug_str.dwo -.debug_str_offsets_end_TU1: +.debug_str_offsets_object_file1_end: # Object files 2's portion of the .debug_str_offsets.dwo section. -.debug_str_offsets_object_file2: - -# CU2's contribution (from object file 2) -.debug_str_offsets_start_CU2: - .long .debug_str_offsets_end_CU2-.debug_str_offsets_base_CU2 +# CU2 and TU2 share a contribution to the string offsets table. +.debug_str_offsets_object_file2_start: + .long .debug_str_offsets_object_file2_end-.debug_str_offsets_base_2 .short 5 # DWARF version .short 0 # Padding -.debug_str_offsets_base_CU2: +.debug_str_offsets_base_2: .long str_producer-.debug_str.dwo .long str_CU2-.debug_str.dwo .long str_CU2_dir-.debug_str.dwo -.debug_str_offsets_end_CU2: - -# TU2's contribution (from object file 2) -.debug_str_offsets_start_TU2: - .long .debug_str_offsets_end_TU2-.debug_str_offsets_base_TU2 - .short 5 # DWARF version - .short 0 # Padding -.debug_str_offsets_base_TU2: .long str_TU2-.debug_str.dwo .long str_TU2_type-.debug_str.dwo -.debug_str_offsets_end_TU2: +.debug_str_offsets_object_file2_end: +# Object files 3's portion of the .debug_str_offsets.dwo section. +# This file is assumed to have been compiled with -gdwarf-4 and +# therefore contains a version 4 CU and a GNU format contribution +# to the .debug_str_offsets section. +.debug_str_offsets_object_file3_start: +.debug_str_offsets_base_3: + .long str_producer-.debug_str.dwo + .long str_CU3-.debug_str.dwo + .long str_CU3_dir-.debug_str.dwo +.debug_str_offsets_object_file3_end: # Abbrevs are shared for all compile and type units .section .debug_abbrev.dwo,"",@progbits @@ -85,8 +80,6 @@ str_TU2_type: .byte 0x1a # DW_FORM_strx .byte 0x03 # DW_AT_name .byte 0x1a # DW_FORM_strx - .byte 0x72 # DW_AT_str_offsets_base - .byte 0x17 # DW_FORM_sec_offset .byte 0x03 # DW_AT_name .byte 0x1a # DW_FORM_strx .byte 0x00 # EOM(1) @@ -96,8 +89,6 @@ str_TU2_type: .byte 0x01 # DW_CHILDREN_yes .byte 0x03 # DW_AT_name .byte 0x1a # DW_FORM_strx - .byte 0x72 # DW_AT_str_offsets_base - .byte 0x17 # DW_FORM_sec_offset .byte 0x00 # EOM(1) .byte 0x00 # EOM(2) .byte 0x03 # Abbrev code @@ -107,6 +98,17 @@ str_TU2_type: .byte 0x1a # DW_FORM_strx .byte 0x00 # EOM(1) .byte 0x00 # EOM(2) + .byte 0x04 # Abbrev code + .byte 0x11 # DW_TAG_compile_unit + .byte 0x00 # DW_CHILDREN_no + .byte 0x25 # DW_AT_producer + .short 0x3e82 # DW_FORM_GNU_str_index + .byte 0x03 # DW_AT_name + .short 0x3e82 # DW_FORM_GNU_str_index + .byte 0x03 # DW_AT_name + .short 0x3e82 # DW_FORM_GNU_str_index + .byte 0x00 # EOM(1) + .byte 0x00 # EOM(2) .byte 0x00 # EOM(3) abbrev_end: @@ -120,15 +122,11 @@ CU1_5_version: .byte 1 # DWARF Unit Type .byte 8 # Address Size (in bytes) .long .debug_abbrev.dwo # Offset Into Abbrev. Section -# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name, -# DW_AT_str_offsets and DW_AT_compdir. +# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name +# and DW_AT_compdir. .byte 1 # Abbreviation code .byte 0 # The index of the producer string .byte 1 # The index of the CU name string -# The DW_AT_str_offsets_base attribute for CU1 contains the offset of CU1's -# contribution relative to the start of object file 1's portion of the -# .debug_str_offsets section. - .long .debug_str_offsets_base_CU1-.debug_str_offsets_object_file1 .byte 2 # The index of the comp dir string .byte 0 # NULL CU1_5_end: @@ -140,19 +138,30 @@ CU2_5_version: .byte 1 # DWARF Unit Type .byte 8 # Address Size (in bytes) .long .debug_abbrev.dwo # Offset Into Abbrev. Section -# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name, -# DW_AT_str_offsets and DW_AT_compdir. +# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name +# and DW_AT_compdir. .byte 1 # Abbreviation code .byte 0 # The index of the producer string .byte 1 # The index of the CU name string -# The DW_AT_str_offsets_base attribute for CU2 contains the offset of CU2's -# contribution relative to the start of object file 2's portion of the -# .debug_str_offsets section. - .long .debug_str_offsets_base_CU2-.debug_str_offsets_object_file2 .byte 2 # The index of the comp dir string .byte 0 # NULL CU2_5_end: +CU3_4_start: + .long CU3_4_end-CU3_4_version # Length of Unit +CU3_4_version: + .short 4 # DWARF version number + .long .debug_abbrev.dwo # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) +# The compile-unit DIE, which has a DW_AT_producer, DW_AT_name +# and DW_AT_compdir. + .byte 4 # Abbreviation code + .byte 0 # The index of the producer string + .byte 1 # The index of the CU name string + .byte 2 # The index of the comp dir string + .byte 0 # NULL +CU3_4_end: + .section .debug_types.dwo,"",@progbits # DWARF v5 Type unit header. TU1_5_start: @@ -166,15 +175,11 @@ TU1_5_version: .long TU1_5_type-TU1_5_start # Type offset # The type-unit DIE, which has a name. .byte 2 # Abbreviation code - .byte 0 # Index of the unit type name string -# The DW_AT_str_offsets_base attribute for TU1 contains the offset of TU1's -# contribution relative to the start of object file 1's portion of the -# .debug_str_offsets section. - .long .debug_str_offsets_base_TU1-.debug_str_offsets_object_file1 + .byte 3 # Index of the unit type name string # The type DIE, which has a name. TU1_5_type: .byte 3 # Abbreviation code - .byte 1 # Index of the type name string + .byte 4 # Index of the type name string .byte 0 # NULL .byte 0 # NULL TU1_5_end: @@ -190,15 +195,11 @@ TU2_5_version: .long TU2_5_type-TU2_5_start # Type offset # The type-unit DIE, which has a name. .byte 2 # Abbreviation code - .byte 0 # Index of the unit type name string -# The DW_AT_str_offsets_base attribute for TU2 contains the offset of TU2's -# contribution relative to the start of object file 2's portion of the -# .debug_str_offsets section. - .long .debug_str_offsets_base_TU2-.debug_str_offsets_object_file2 + .byte 3 # Index of the unit type name string # The type DIE, which has a name. TU2_5_type: .byte 3 # Abbreviation code - .byte 1 # Index of the type name string + .byte 4 # Index of the type name string .byte 0 # NULL .byte 0 # NULL TU2_5_end: @@ -207,37 +208,45 @@ TU2_5_end: # The index header .long 2 # Version .long 3 # Columns of contribution matrix - .long 2 # number of units - .long 2 # number of hash buckets in table + .long 3 # number of units + .long 3 # number of hash buckets in table - # The signatures for both CUs. + # The signatures for all CUs. .quad 0xddeeaaddbbaabbee # signature 1 .quad 0xff00ffeeffaaff00 # signature 2 + .quad 0xf00df00df00df00d # signature 2 # The indexes for both CUs. .long 1 # index 1 .long 2 # index 2 - # The sections to which both CUs contribute. + .long 3 # index 3 + # The sections to which all CUs contribute. .long 1 # DW_SECT_INFO .long 3 # DW_SECT_ABBREV .long 6 # DW_SECT_STR_OFFSETS - # The starting offsets of both CU's contributions to info, + # The starting offsets of all CU's contributions to info, # abbrev and string offsets table. .long CU1_5_start-.debug_info.dwo .long 0 - .long .debug_str_offsets_object_file1-.debug_str_offsets.dwo + .long .debug_str_offsets_object_file1_start-.debug_str_offsets.dwo .long CU2_5_start-.debug_info.dwo .long 0 - .long .debug_str_offsets_object_file2-.debug_str_offsets.dwo + .long .debug_str_offsets_object_file2_start-.debug_str_offsets.dwo + .long CU3_4_start-.debug_info.dwo + .long 0 + .long .debug_str_offsets_object_file3_start-.debug_str_offsets.dwo - # The lengths of both CU's contributions to info, abbrev and + # The lengths of all CU's contributions to info, abbrev and # string offsets table. .long CU1_5_end-CU1_5_start .long abbrev_end-.debug_abbrev.dwo - .long .debug_str_offsets_end_CU1-.debug_str_offsets_start_CU1 + .long .debug_str_offsets_object_file1_end-.debug_str_offsets_object_file1_start .long CU2_5_end-CU2_5_start .long abbrev_end-.debug_abbrev.dwo - .long .debug_str_offsets_end_CU2-.debug_str_offsets_start_CU2 + .long .debug_str_offsets_object_file2_end-.debug_str_offsets_object_file2_start + .long CU3_4_end-CU3_4_start + .long abbrev_end-.debug_abbrev.dwo + .long .debug_str_offsets_object_file3_end-.debug_str_offsets_object_file3_start .section .debug_tu_index,"",@progbits # The index header @@ -261,19 +270,19 @@ TU2_5_end: # abbrev and string offsets table. .long TU1_5_start-.debug_types.dwo .long 0 - .long .debug_str_offsets_object_file1-.debug_str_offsets.dwo + .long .debug_str_offsets_object_file1_start-.debug_str_offsets.dwo .long TU2_5_start-.debug_types.dwo .long 0 - .long .debug_str_offsets_object_file2-.debug_str_offsets.dwo + .long .debug_str_offsets_object_file2_start-.debug_str_offsets.dwo # The lengths of both TU's contributions to info, abbrev and # string offsets table. .long TU1_5_end-TU1_5_start .long abbrev_end-.debug_abbrev.dwo - .long .debug_str_offsets_end_TU1-.debug_str_offsets_start_TU1 + .long .debug_str_offsets_object_file1_end-.debug_str_offsets_object_file1_start .long TU2_5_end-TU2_5_start .long abbrev_end-.debug_abbrev.dwo - .long .debug_str_offsets_end_TU2-.debug_str_offsets_start_TU2 + .long .debug_str_offsets_object_file2_end-.debug_str_offsets_object_file2_start # Verify that the correct strings from each unit are displayed and that the @@ -284,7 +293,6 @@ TU2_5_end: # CHECK: DW_TAG_compile_unit # CHECK-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer") # CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_1") -# CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008) # CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU1") # CHECK-NOT: NULL @@ -293,26 +301,23 @@ TU2_5_end: # CHECK: DW_TAG_compile_unit # CHECK-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer") # CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_2") -# CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008) # CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU2") # # CHECK: Type Unit # CHECK-NOT: NULL # CHECK: DW_TAG_type_unit -# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit_1") -# CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000001c) +# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000003) string = "Type_Unit_1") # CHECK-NOT: NULL # CHECK: DW_TAG_structure_type -# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct_1") +# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000004) string = "MyStruct_1") # # CHECK: Type Unit # CHECK-NOT: NULL # CHECK: DW_TAG_type_unit -# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit_2") -# CHECK-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000001c) +# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000003) string = "Type_Unit_2") # CHECK-NOT: NULL # CHECK: DW_TAG_structure_type -# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct_2") +# CHECK-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000004) string = "MyStruct_2") # Verify the correct offets of the compile and type units contributions in the # index tables. @@ -322,11 +327,11 @@ TU2_5_end: # CHECK: 1 0xddeeaaddbbaabbee [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) # CHECK-SAME: [0x00000000 # CHECK-NEXT: 2 0xff00ffeeffaaff00 [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) -# CHECK-SAME: [0x00000024 +# CHECK-SAME: [0x0000001c # CHECK: .debug_tu_index contents: # CHECK-NOT: contents: # CHECK: 1 0xeeaaddbbaabbeedd [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) # CHECK-SAME: [0x00000000 # CHECK-NEXT: 2 0x00ffeeffaaff00ff [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) [{{0x[0-9a-f]*, 0x[0-9a-f]*}}) -# CHECK: [0x00000024 +# CHECK: [0x0000001c diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s index ebde22213728..0ff6fdbb0aa7 100644 --- a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s +++ b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-3.s @@ -37,6 +37,8 @@ dwo_str_TU_5_type: .byte 0x01 # Abbrev code .byte 0x11 # DW_TAG_compile_unit .byte 0x00 # DW_CHILDREN_no + .byte 0x72 # DW_AT_str_offsets_base + .byte 0x17 # DW_FORM_sec_offset .byte 0x00 # EOM(1) .byte 0x00 # EOM(2) .byte 0x00 # EOM(3) @@ -54,13 +56,13 @@ CU1_5_version: .long .debug_abbrev # Offset Into Abbrev. Section # A compile-unit DIE, which has no attributes. .byte 1 # Abbreviation code + .long .debug_str_offsets_base0 CU1_5_end: .section .debug_str_offsets,"",@progbits # CU1's contribution # Invalid length .long 0xfffffffe - .long .debug_str_offsets_segment0_end-.debug_str_offsets_base0 .short 5 # DWARF version .short 0 # Padding .debug_str_offsets_base0: diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s index 9c6094ebade1..36ac7124a9f0 100644 --- a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s +++ b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-4.s @@ -15,6 +15,8 @@ str_CU1: .byte 0x01 # Abbrev code .byte 0x11 # DW_TAG_compile_unit .byte 0x00 # DW_CHILDREN_no + .byte 0x72 # DW_AT_str_offsets_base + .byte 0x17 # DW_FORM_sec_offset .byte 0x00 # EOM(1) .byte 0x00 # EOM(2) .byte 0x00 # EOM(3) @@ -32,6 +34,7 @@ CU1_5_version: .long .debug_abbrev # Offset Into Abbrev. Section # A compile-unit DIE, which has no attributes. .byte 1 # Abbreviation code + .long .debug_str_offsets_base0 CU1_5_end: # Every unit contributes to the string_offsets table. @@ -50,4 +53,4 @@ CU1_5_end: # INVALIDLENGTH: .debug_str_offsets contents: # INVALIDLENGTH-NOT: contents: -# INVALIDLENGTH: error: contribution to string offsets table in section .debug_str_offsets has invalid length. +# INVALIDLENGTH: error: invalid contribution to string offsets table in section .debug_str_offsets. diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-6.s b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-6.s new file mode 100644 index 000000000000..28c4a418d125 --- /dev/null +++ b/test/DebugInfo/X86/dwarfdump-str-offsets-invalid-6.s @@ -0,0 +1,94 @@ +# RUN: llvm-mc -triple x86_64-unknown-linux %s -filetype=obj -o %t.o +# RUN: llvm-dwarfdump -v %t.o | FileCheck --check-prefix=OVERLAP %s +# +# Test object to verify that llvm-dwarfdump handles an invalid string offsets +# table with overlapping contributions. + + .section .debug_str,"MS",@progbits,1 +str_producer: + .asciz "Handmade DWARF producer" +str_CU1: + .asciz "Compile_Unit_1" +str_CU1_dir: + .asciz "/home/test/CU1" +str_CU2: + .asciz "Compile_Unit_2" +str_CU2_dir: + .asciz "/home/test/CU2" +str_TU: + .asciz "Type_Unit" +str_TU_type: + .asciz "MyStruct" + + .section .debug_str.dwo,"MS",@progbits,1 +dwo_str_CU_5_producer: + .asciz "Handmade split DWARF producer" +dwo_str_CU_5_name: + .asciz "V5_split_compile_unit" +dwo_str_CU_5_comp_dir: + .asciz "/home/test/splitCU" +dwo_str_TU_5: + .asciz "V5_split_type_unit" +dwo_str_TU_5_type: + .asciz "V5_split_Mystruct" + +# A rudimentary abbrev section. + .section .debug_abbrev,"",@progbits + .byte 0x01 # Abbrev code + .byte 0x11 # DW_TAG_compile_unit + .byte 0x00 # DW_CHILDREN_no + .byte 0x72 # DW_AT_str_offsets_base + .byte 0x17 # DW_FORM_sec_offset + .byte 0x00 # EOM(1) + .byte 0x00 # EOM(2) + .byte 0x00 # EOM(3) + + .section .debug_info,"",@progbits +# DWARF v5 CU header. + .long CU1_5_end-CU1_5_version # Length of Unit +CU1_5_version: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section +# A compile-unit DIE, which has no attributes. + .byte 1 # Abbreviation code + .long .debug_str_offsets_base0 +CU1_5_end: + +# DWARF v5 CU header. + .long CU2_5_end-CU2_5_version # Length of Unit +CU2_5_version: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section +# A compile-unit DIE, which has no attributes. + .byte 1 # Abbreviation code + .long .debug_str_offsets_base1 +CU2_5_end: + + .section .debug_str_offsets,"",@progbits +# CU1's contribution + .long .debug_str_offsets_segment1_end-.debug_str_offsets_base0 + .short 5 # DWARF version + .short 0 # Padding +.debug_str_offsets_base0: + .long str_producer + .long str_CU1 + .long str_CU1_dir +.debug_str_offsets_segment0_end: +# CU2's contribution +# Overlapping with CU1's contribution + .long .debug_str_offsets_segment1_end-.debug_str_offsets_base1 + .short 5 # DWARF version + .short 0 # Padding +.debug_str_offsets_base1: + .long str_producer + .long str_CU2 + .long str_CU2_dir +.debug_str_offsets_segment1_end: + +# OVERLAP: .debug_str_offsets contents: +# OVERLAP-NOT: contents: +# OVERLAP: error: overlapping contributions to string offsets table in section .debug_str_offsets. diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets-macho.s b/test/DebugInfo/X86/dwarfdump-str-offsets-macho.s index 34f151d37d95..f8f48ea13882 100644 --- a/test/DebugInfo/X86/dwarfdump-str-offsets-macho.s +++ b/test/DebugInfo/X86/dwarfdump-str-offsets-macho.s @@ -43,14 +43,17 @@ Ldebug_str_offsets_base0: .long str_Variable2 .long str_Variable3 Ldebug_str_offsets_segment0_end: -# CU2's contribution - .long Ldebug_str_offsets_segment1_end-Ldebug_str_offsets_base1 +# A 4-byte gap. + .long 0 +# CU2's contribution (DWARF64 format) + .long 0xffffffff + .quad Ldebug_str_offsets_segment1_end-Ldebug_str_offsets_base1 .short 5 # DWARF version .short 0 # Padding Ldebug_str_offsets_base1: - .long str_producer - .long str_CU2 - .long str_CU2_dir + .quad str_producer + .quad str_CU2 + .quad str_CU2_dir Ldebug_str_offsets_segment1_end: # The TU's contribution .long Ldebug_str_offsets_segment2_end-Ldebug_str_offsets_base2 @@ -234,20 +237,20 @@ TU_5_end: # COMMON: DW_TAG_compile_unit # COMMON-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer") # COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_2") -# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000002c) +# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000038) # COMMON-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU2") # # The type unit # COMMON: .debug_types contents: # COMMON: DW_TAG_type_unit # COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit") -# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000040) +# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000058) # COMMON: DW_TAG_structure_type # COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct") # # The .debug_str_offsets section # COMMON: .debug_str_offsets contents: -# COMMON-NEXT: 0x00000000: Contribution size = 28, Version = 5 +# COMMON-NEXT: 0x00000000: Contribution size = 28, Format = DWARF32, Version = 5 # COMMON-NEXT: 0x00000008: 00000000 "Handmade DWARF producer" # COMMON-NEXT: 0x0000000c: 00000018 "Compile_Unit_1" # COMMON-NEXT: 0x00000010: 00000027 "/home/test/CU1" @@ -255,10 +258,11 @@ TU_5_end: # COMMON-NEXT: 0x00000018: 0000006e "MyVar1" # COMMON-NEXT: 0x0000001c: 00000075 "MyVar2" # COMMON-NEXT: 0x00000020: 0000007c "MyVar3" -# COMMON-NEXT: 0x00000024: Contribution size = 12, Version = 5 -# COMMON-NEXT: 0x0000002c: 00000000 "Handmade DWARF producer" -# COMMON-NEXT: 0x00000030: 00000036 "Compile_Unit_2" -# COMMON-NEXT: 0x00000034: 00000045 "/home/test/CU2" -# COMMON-NEXT: 0x00000038: Contribution size = 8, Version = 5 -# COMMON-NEXT: 0x00000040: 00000054 "Type_Unit" -# COMMON-NEXT: 0x00000044: 0000005e "MyStruct" +# COMMON-NEXT: 0x00000024: Gap, length = 4 +# COMMON-NEXT: 0x00000028: Contribution size = 24, Format = DWARF64, Version = 5 +# COMMON-NEXT: 0x00000038: 00000000 "Handmade DWARF producer" +# COMMON-NEXT: 0x00000040: 00000036 "Compile_Unit_2" +# COMMON-NEXT: 0x00000048: 00000045 "/home/test/CU2" +# COMMON-NEXT: 0x00000050: Contribution size = 8, Format = DWARF32, Version = 5 +# COMMON-NEXT: 0x00000058: 00000054 "Type_Unit" +# COMMON-NEXT: 0x0000005c: 0000005e "MyStruct" diff --git a/test/DebugInfo/X86/dwarfdump-str-offsets.s b/test/DebugInfo/X86/dwarfdump-str-offsets.s index 5fbef186af60..363775a59f50 100644 --- a/test/DebugInfo/X86/dwarfdump-str-offsets.s +++ b/test/DebugInfo/X86/dwarfdump-str-offsets.s @@ -44,14 +44,17 @@ str_Variable3: .long str_Variable2 .long str_Variable3 .debug_str_offsets_segment0_end: -# CU2's contribution - .long .debug_str_offsets_segment1_end-.debug_str_offsets_base1 +# A 4-byte gap. + .long 0 +# CU2's contribution in DWARF64 format + .long 0xffffffff + .quad .debug_str_offsets_segment1_end-.debug_str_offsets_base1 .short 5 # DWARF version .short 0 # Padding .debug_str_offsets_base1: - .long str_producer - .long str_CU2 - .long str_CU2_dir + .quad str_producer + .quad str_CU2 + .quad str_CU2_dir .debug_str_offsets_segment1_end: # The TU's contribution .long .debug_str_offsets_segment2_end-.debug_str_offsets_base2 @@ -75,7 +78,7 @@ dwo_str_TU_5_type: .asciz "V5_split_Mystruct" .section .debug_str_offsets.dwo,"",@progbits -# The split CU's contribution +# One contribution only in a .dwo file .long .debug_dwo_str_offsets_segment0_end-.debug_dwo_str_offsets_base0 .short 5 # DWARF version .short 0 # Padding @@ -83,15 +86,9 @@ dwo_str_TU_5_type: .long dwo_str_CU_5_producer-.debug_str.dwo .long dwo_str_CU_5_name-.debug_str.dwo .long dwo_str_CU_5_comp_dir-.debug_str.dwo -.debug_dwo_str_offsets_segment0_end: -# The split TU's contribution - .long .debug_dwo_str_offsets_segment1_end-.debug_dwo_str_offsets_base1 - .short 5 # DWARF version - .short 0 # Padding -.debug_dwo_str_offsets_base1: .long dwo_str_TU_5-.debug_str.dwo .long dwo_str_TU_5_type-.debug_str.dwo -.debug_dwo_str_offsets_segment1_end: +.debug_dwo_str_offsets_segment0_end: # All CUs/TUs use the same abbrev section for simplicity. .section .debug_abbrev,"",@progbits @@ -163,8 +160,6 @@ dwo_str_TU_5_type: .byte 0x1a # DW_FORM_strx .byte 0x03 # DW_AT_name .byte 0x1a # DW_FORM_strx - .byte 0x72 # DW_AT_str_offsets_base - .byte 0x17 # DW_FORM_sec_offset .byte 0x1b # DW_AT_comp_dir .byte 0x1a # DW_FORM_strx .byte 0x00 # EOM(1) @@ -174,8 +169,6 @@ dwo_str_TU_5_type: .byte 0x01 # DW_CHILDREN_yes .byte 0x03 # DW_AT_name .byte 0x1a # DW_FORM_strx - .byte 0x72 # DW_AT_str_offsets_base - .byte 0x17 # DW_FORM_sec_offset .byte 0x00 # EOM(1) .byte 0x00 # EOM(2) .byte 0x03 # Abbrev code @@ -275,7 +268,6 @@ CU_split_5_version: .byte 1 # Abbreviation code .byte 0 # The index of the producer string .byte 1 # The index of the CU name string - .long .debug_dwo_str_offsets_base0-.debug_str_offsets.dwo .byte 2 # The index of the comp dir string .byte 0 # NULL CU_split_5_end: @@ -294,12 +286,11 @@ TU_split_5_version: .long TU_split_5_type-TU_split_5_start # Type offset # The type-unit DIE, which has a name. .byte 2 # Abbreviation code - .byte 0 # The index of the type unit name string - .long .debug_dwo_str_offsets_base1-.debug_str_offsets.dwo + .byte 3 # The index of the type unit name string # The type DIE, which has a name. TU_split_5_type: .byte 3 # Abbreviation code - .byte 1 # The index of the type name string + .byte 4 # The index of the type name string .byte 0 # NULL .byte 0 # NULL TU_split_5_end: @@ -340,7 +331,7 @@ TU_split_5_end: # COMMON: DW_TAG_compile_unit # COMMON-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade DWARF producer") # COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "Compile_Unit_2") -# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000002c) +# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000038) # COMMON-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/CU2") # # The split CU @@ -349,28 +340,25 @@ TU_split_5_end: # SPLIT: DW_TAG_compile_unit # SPLIT-NEXT: DW_AT_producer [DW_FORM_strx] ( indexed (00000000) string = "Handmade split DWARF producer") # SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_compile_unit") -# SPLIT-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008) # SPLIT-NEXT: DW_AT_comp_dir [DW_FORM_strx] ( indexed (00000002) string = "/home/test/splitCU") # # The type unit # COMMON: .debug_types contents: # COMMON: DW_TAG_type_unit # COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "Type_Unit") -# COMMON-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000040) # COMMON: DW_TAG_structure_type # COMMON-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "MyStruct") # # The split type unit # SPLIT: .debug_types.dwo contents: # SPLIT: DW_TAG_type_unit -# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000000) string = "V5_split_type_unit") -# SPLIT-NEXT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x0000001c) +# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000003) string = "V5_split_type_unit") # SPLIT: DW_TAG_structure_type -# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000001) string = "V5_split_Mystruct") +# SPLIT-NEXT: DW_AT_name [DW_FORM_strx] ( indexed (00000004) string = "V5_split_Mystruct") # # The .debug_str_offsets section # COMMON: .debug_str_offsets contents: -# COMMON-NEXT: 0x00000000: Contribution size = 28, Version = 5 +# COMMON-NEXT: 0x00000000: Contribution size = 28, Format = DWARF32, Version = 5 # COMMON-NEXT: 0x00000008: 00000000 "Handmade DWARF producer" # COMMON-NEXT: 0x0000000c: 00000018 "Compile_Unit_1" # COMMON-NEXT: 0x00000010: 00000027 "/home/test/CU1" @@ -378,19 +366,19 @@ TU_split_5_end: # COMMON-NEXT: 0x00000018: 0000006e "MyVar1" # COMMON-NEXT: 0x0000001c: 00000075 "MyVar2" # COMMON-NEXT: 0x00000020: 0000007c "MyVar3" -# COMMON-NEXT: 0x00000024: Contribution size = 12, Version = 5 -# COMMON-NEXT: 0x0000002c: 00000000 "Handmade DWARF producer" -# COMMON-NEXT: 0x00000030: 00000036 "Compile_Unit_2" -# COMMON-NEXT: 0x00000034: 00000045 "/home/test/CU2" -# COMMON-NEXT: 0x00000038: Contribution size = 8, Version = 5 -# COMMON-NEXT: 0x00000040: 00000054 "Type_Unit" -# COMMON-NEXT: 0x00000044: 0000005e "MyStruct" +# COMMON-NEXT: Gap, length = 4 +# COMMON-NEXT: 0x00000028: Contribution size = 24, Format = DWARF64, Version = 5 +# COMMON-NEXT: 0x00000038: 00000000 "Handmade DWARF producer" +# COMMON-NEXT: 0x00000040: 00000036 "Compile_Unit_2" +# COMMON-NEXT: 0x00000048: 00000045 "/home/test/CU2" +# COMMON-NEXT: 0x00000050: Contribution size = 8, Format = DWARF32, Version = 5 +# COMMON-NEXT: 0x00000058: 00000054 "Type_Unit" +# COMMON-NEXT: 0x0000005c: 0000005e "MyStruct" # # SPLIT: .debug_str_offsets.dwo contents: -# SPLIT-NEXT: 0x00000000: Contribution size = 12, Version = 5 +# SPLIT-NEXT: 0x00000000: Contribution size = 20, Format = DWARF32, Version = 5 # SPLIT-NEXT: 0x00000008: 00000000 "Handmade split DWARF producer" # SPLIT-NEXT: 0x0000000c: 0000001e "V5_split_compile_unit" # SPLIT-NEXT: 0x00000010: 00000034 "/home/test/splitCU" -# SPLIT-NEXT: 0x00000014: Contribution size = 8, Version = 5 -# SPLIT-NEXT: 0x0000001c: 00000047 "V5_split_type_unit" -# SPLIT-NEXT: 0x00000020: 0000005a "V5_split_Mystruct" +# SPLIT-NEXT: 0x00000014: 00000047 "V5_split_type_unit" +# SPLIT-NEXT: 0x00000018: 0000005a "V5_split_Mystruct" diff --git a/test/Instrumentation/HWAddressSanitizer/basic.ll b/test/Instrumentation/HWAddressSanitizer/basic.ll index db1f52fc56e9..a5e49062669e 100644 --- a/test/Instrumentation/HWAddressSanitizer/basic.ll +++ b/test/Instrumentation/HWAddressSanitizer/basic.ll @@ -1,6 +1,7 @@ ; Test basic address sanitizer instrumentation. ; -; RUN: opt < %s -hwasan -S | FileCheck %s +; RUN: opt < %s -hwasan -hwasan-recover=0 -S | FileCheck %s --check-prefixes=CHECK,ABORT +; RUN: opt < %s -hwasan -hwasan-recover=1 -S | FileCheck %s --check-prefixes=CHECK,RECOVER target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-android" @@ -17,8 +18,10 @@ define i8 @test_load8(i8* %a) sanitize_hwaddress { ; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]] ; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}} -; CHECK: call void asm sideeffect "hlt #256", "{x0}"(i64 %[[A]]) -; CHECK: br label +; ABORT: call void asm sideeffect "hlt #256", "{x0}"(i64 %[[A]]) +; ABORT: unreachable +; RECOVER: call void asm sideeffect "hlt #288", "{x0}"(i64 %[[A]]) +; RECOVER: br label ; CHECK: %[[G:[^ ]*]] = load i8, i8* %a, align 4 ; CHECK: ret i8 %[[G]] @@ -40,8 +43,10 @@ define i16 @test_load16(i16* %a) sanitize_hwaddress { ; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]] ; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}} -; CHECK: call void asm sideeffect "hlt #257", "{x0}"(i64 %[[A]]) -; CHECK: br label +; ABORT: call void asm sideeffect "hlt #257", "{x0}"(i64 %[[A]]) +; ABORT: unreachable +; RECOVER: call void asm sideeffect "hlt #289", "{x0}"(i64 %[[A]]) +; RECOVER: br label ; CHECK: %[[G:[^ ]*]] = load i16, i16* %a, align 4 ; CHECK: ret i16 %[[G]] @@ -63,8 +68,10 @@ define i32 @test_load32(i32* %a) sanitize_hwaddress { ; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]] ; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}} -; CHECK: call void asm sideeffect "hlt #258", "{x0}"(i64 %[[A]]) -; CHECK: br label +; ABORT: call void asm sideeffect "hlt #258", "{x0}"(i64 %[[A]]) +; ABORT: unreachable +; RECOVER: call void asm sideeffect "hlt #290", "{x0}"(i64 %[[A]]) +; RECOVER: br label ; CHECK: %[[G:[^ ]*]] = load i32, i32* %a, align 4 ; CHECK: ret i32 %[[G]] @@ -86,8 +93,10 @@ define i64 @test_load64(i64* %a) sanitize_hwaddress { ; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]] ; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}} -; CHECK: call void asm sideeffect "hlt #259", "{x0}"(i64 %[[A]]) -; CHECK: br label +; ABORT: call void asm sideeffect "hlt #259", "{x0}"(i64 %[[A]]) +; ABORT: unreachable +; RECOVER: call void asm sideeffect "hlt #291", "{x0}"(i64 %[[A]]) +; RECOVER: br label ; CHECK: %[[G:[^ ]*]] = load i64, i64* %a, align 8 ; CHECK: ret i64 %[[G]] @@ -109,8 +118,10 @@ define i128 @test_load128(i128* %a) sanitize_hwaddress { ; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]] ; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}} -; CHECK: call void asm sideeffect "hlt #260", "{x0}"(i64 %[[A]]) -; CHECK: br label +; ABORT: call void asm sideeffect "hlt #260", "{x0}"(i64 %[[A]]) +; ABORT: unreachable +; RECOVER: call void asm sideeffect "hlt #292", "{x0}"(i64 %[[A]]) +; RECOVER: br label ; CHECK: %[[G:[^ ]*]] = load i128, i128* %a, align 16 ; CHECK: ret i128 %[[G]] @@ -123,7 +134,8 @@ entry: define i40 @test_load40(i40* %a) sanitize_hwaddress { ; CHECK-LABEL: @test_load40( ; CHECK: %[[A:[^ ]*]] = ptrtoint i40* %a to i64 -; CHECK: call void @__hwasan_load(i64 %[[A]], i64 5) +; ABORT: call void @__hwasan_load(i64 %[[A]], i64 5) +; RECOVER: call void @__hwasan_load_noabort(i64 %[[A]], i64 5) ; CHECK: %[[B:[^ ]*]] = load i40, i40* %a ; CHECK: ret i40 %[[B]] @@ -144,8 +156,10 @@ define void @test_store8(i8* %a, i8 %b) sanitize_hwaddress { ; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]] ; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}} -; CHECK: call void asm sideeffect "hlt #272", "{x0}"(i64 %[[A]]) -; CHECK: br label +; ABORT: call void asm sideeffect "hlt #272", "{x0}"(i64 %[[A]]) +; ABORT: unreachable +; RECOVER: call void asm sideeffect "hlt #304", "{x0}"(i64 %[[A]]) +; RECOVER: br label ; CHECK: store i8 %b, i8* %a, align 4 ; CHECK: ret void @@ -167,8 +181,10 @@ define void @test_store16(i16* %a, i16 %b) sanitize_hwaddress { ; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]] ; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}} -; CHECK: call void asm sideeffect "hlt #273", "{x0}"(i64 %[[A]]) -; CHECK: br label +; ABORT: call void asm sideeffect "hlt #273", "{x0}"(i64 %[[A]]) +; ABORT: unreachable +; RECOVER: call void asm sideeffect "hlt #305", "{x0}"(i64 %[[A]]) +; RECOVER: br label ; CHECK: store i16 %b, i16* %a, align 4 ; CHECK: ret void @@ -190,8 +206,10 @@ define void @test_store32(i32* %a, i32 %b) sanitize_hwaddress { ; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]] ; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}} -; CHECK: call void asm sideeffect "hlt #274", "{x0}"(i64 %[[A]]) -; CHECK: br label +; ABORT: call void asm sideeffect "hlt #274", "{x0}"(i64 %[[A]]) +; ABORT: unreachable +; RECOVER: call void asm sideeffect "hlt #306", "{x0}"(i64 %[[A]]) +; RECOVER: br label ; CHECK: store i32 %b, i32* %a, align 4 ; CHECK: ret void @@ -213,8 +231,10 @@ define void @test_store64(i64* %a, i64 %b) sanitize_hwaddress { ; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]] ; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}} -; CHECK: call void asm sideeffect "hlt #275", "{x0}"(i64 %[[A]]) -; CHECK: br label +; ABORT: call void asm sideeffect "hlt #275", "{x0}"(i64 %[[A]]) +; ABORT: unreachable +; RECOVER: call void asm sideeffect "hlt #307", "{x0}"(i64 %[[A]]) +; RECOVER: br label ; CHECK: store i64 %b, i64* %a, align 8 ; CHECK: ret void @@ -236,8 +256,10 @@ define void @test_store128(i128* %a, i128 %b) sanitize_hwaddress { ; CHECK: %[[F:[^ ]*]] = icmp ne i8 %[[PTRTAG]], %[[MEMTAG]] ; CHECK: br i1 %[[F]], label {{.*}}, label {{.*}}, !prof {{.*}} -; CHECK: call void asm sideeffect "hlt #276", "{x0}"(i64 %[[A]]) -; CHECK: br label +; ABORT: call void asm sideeffect "hlt #276", "{x0}"(i64 %[[A]]) +; ABORT: unreachable +; RECOVER: call void asm sideeffect "hlt #308", "{x0}"(i64 %[[A]]) +; RECOVER: br label ; CHECK: store i128 %b, i128* %a, align 16 ; CHECK: ret void @@ -250,7 +272,8 @@ entry: define void @test_store40(i40* %a, i40 %b) sanitize_hwaddress { ; CHECK-LABEL: @test_store40( ; CHECK: %[[A:[^ ]*]] = ptrtoint i40* %a to i64 -; CHECK: call void @__hwasan_store(i64 %[[A]], i64 5) +; ABORT: call void @__hwasan_store(i64 %[[A]], i64 5) +; RECOVER: call void @__hwasan_store_noabort(i64 %[[A]], i64 5) ; CHECK: store i40 %b, i40* %a ; CHECK: ret void @@ -262,7 +285,8 @@ entry: define void @test_store_unaligned(i64* %a, i64 %b) sanitize_hwaddress { ; CHECK-LABEL: @test_store_unaligned( ; CHECK: %[[A:[^ ]*]] = ptrtoint i64* %a to i64 -; CHECK: call void @__hwasan_store(i64 %[[A]], i64 8) +; ABORT: call void @__hwasan_store(i64 %[[A]], i64 8) +; RECOVER: call void @__hwasan_store_noabort(i64 %[[A]], i64 8) ; CHECK: store i64 %b, i64* %a, align 4 ; CHECK: ret void diff --git a/test/Instrumentation/HWAddressSanitizer/with-calls.ll b/test/Instrumentation/HWAddressSanitizer/with-calls.ll index be3b33d4f8d6..0a1713b83912 100644 --- a/test/Instrumentation/HWAddressSanitizer/with-calls.ll +++ b/test/Instrumentation/HWAddressSanitizer/with-calls.ll @@ -1,6 +1,7 @@ ; Test basic address sanitizer instrumentation. ; -; RUN: opt < %s -hwasan -hwasan-instrument-with-calls -S | FileCheck %s +; RUN: opt < %s -hwasan -hwasan-instrument-with-calls -S | FileCheck %s --check-prefixes=CHECK,ABORT +; RUN: opt < %s -hwasan -hwasan-instrument-with-calls -hwasan-recover=1 -S | FileCheck %s --check-prefixes=CHECK,RECOVER target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-android" @@ -8,7 +9,8 @@ target triple = "aarch64--linux-android" define i8 @test_load8(i8* %a) sanitize_hwaddress { ; CHECK-LABEL: @test_load8( ; CHECK: %[[A:[^ ]*]] = ptrtoint i8* %a to i64 -; CHECK: call void @__hwasan_load1(i64 %[[A]]) +; ABORT: call void @__hwasan_load1(i64 %[[A]]) +; RECOVER: call void @__hwasan_load1_noabort(i64 %[[A]]) ; CHECK: %[[B:[^ ]*]] = load i8, i8* %a ; CHECK: ret i8 %[[B]] @@ -20,7 +22,8 @@ entry: define i16 @test_load16(i16* %a) sanitize_hwaddress { ; CHECK-LABEL: @test_load16( ; CHECK: %[[A:[^ ]*]] = ptrtoint i16* %a to i64 -; CHECK: call void @__hwasan_load2(i64 %[[A]]) +; ABORT: call void @__hwasan_load2(i64 %[[A]]) +; RECOVER: call void @__hwasan_load2_noabort(i64 %[[A]]) ; CHECK: %[[B:[^ ]*]] = load i16, i16* %a ; CHECK: ret i16 %[[B]] @@ -32,7 +35,8 @@ entry: define i32 @test_load32(i32* %a) sanitize_hwaddress { ; CHECK-LABEL: @test_load32( ; CHECK: %[[A:[^ ]*]] = ptrtoint i32* %a to i64 -; CHECK: call void @__hwasan_load4(i64 %[[A]]) +; ABORT: call void @__hwasan_load4(i64 %[[A]]) +; RECOVER: call void @__hwasan_load4_noabort(i64 %[[A]]) ; CHECK: %[[B:[^ ]*]] = load i32, i32* %a ; CHECK: ret i32 %[[B]] @@ -44,7 +48,8 @@ entry: define i64 @test_load64(i64* %a) sanitize_hwaddress { ; CHECK-LABEL: @test_load64( ; CHECK: %[[A:[^ ]*]] = ptrtoint i64* %a to i64 -; CHECK: call void @__hwasan_load8(i64 %[[A]]) +; ABORT: call void @__hwasan_load8(i64 %[[A]]) +; RECOVER: call void @__hwasan_load8_noabort(i64 %[[A]]) ; CHECK: %[[B:[^ ]*]] = load i64, i64* %a ; CHECK: ret i64 %[[B]] @@ -56,7 +61,8 @@ entry: define i128 @test_load128(i128* %a) sanitize_hwaddress { ; CHECK-LABEL: @test_load128( ; CHECK: %[[A:[^ ]*]] = ptrtoint i128* %a to i64 -; CHECK: call void @__hwasan_load16(i64 %[[A]]) +; ABORT: call void @__hwasan_load16(i64 %[[A]]) +; RECOVER: call void @__hwasan_load16_noabort(i64 %[[A]]) ; CHECK: %[[B:[^ ]*]] = load i128, i128* %a ; CHECK: ret i128 %[[B]] @@ -68,7 +74,8 @@ entry: define i40 @test_load40(i40* %a) sanitize_hwaddress { ; CHECK-LABEL: @test_load40( ; CHECK: %[[A:[^ ]*]] = ptrtoint i40* %a to i64 -; CHECK: call void @__hwasan_load(i64 %[[A]], i64 5) +; ABORT: call void @__hwasan_load(i64 %[[A]], i64 5) +; RECOVER: call void @__hwasan_load_noabort(i64 %[[A]], i64 5) ; CHECK: %[[B:[^ ]*]] = load i40, i40* %a ; CHECK: ret i40 %[[B]] @@ -80,7 +87,8 @@ entry: define void @test_store8(i8* %a, i8 %b) sanitize_hwaddress { ; CHECK-LABEL: @test_store8( ; CHECK: %[[A:[^ ]*]] = ptrtoint i8* %a to i64 -; CHECK: call void @__hwasan_store1(i64 %[[A]]) +; ABORT: call void @__hwasan_store1(i64 %[[A]]) +; RECOVER: call void @__hwasan_store1_noabort(i64 %[[A]]) ; CHECK: store i8 %b, i8* %a ; CHECK: ret void @@ -92,7 +100,8 @@ entry: define void @test_store16(i16* %a, i16 %b) sanitize_hwaddress { ; CHECK-LABEL: @test_store16( ; CHECK: %[[A:[^ ]*]] = ptrtoint i16* %a to i64 -; CHECK: call void @__hwasan_store2(i64 %[[A]]) +; ABORT: call void @__hwasan_store2(i64 %[[A]]) +; RECOVER: call void @__hwasan_store2_noabort(i64 %[[A]]) ; CHECK: store i16 %b, i16* %a ; CHECK: ret void @@ -104,7 +113,8 @@ entry: define void @test_store32(i32* %a, i32 %b) sanitize_hwaddress { ; CHECK-LABEL: @test_store32( ; CHECK: %[[A:[^ ]*]] = ptrtoint i32* %a to i64 -; CHECK: call void @__hwasan_store4(i64 %[[A]]) +; ABORT: call void @__hwasan_store4(i64 %[[A]]) +; RECOVER: call void @__hwasan_store4_noabort(i64 %[[A]]) ; CHECK: store i32 %b, i32* %a ; CHECK: ret void @@ -116,7 +126,8 @@ entry: define void @test_store64(i64* %a, i64 %b) sanitize_hwaddress { ; CHECK-LABEL: @test_store64( ; CHECK: %[[A:[^ ]*]] = ptrtoint i64* %a to i64 -; CHECK: call void @__hwasan_store8(i64 %[[A]]) +; ABORT: call void @__hwasan_store8(i64 %[[A]]) +; RECOVER: call void @__hwasan_store8_noabort(i64 %[[A]]) ; CHECK: store i64 %b, i64* %a ; CHECK: ret void @@ -128,7 +139,8 @@ entry: define void @test_store128(i128* %a, i128 %b) sanitize_hwaddress { ; CHECK-LABEL: @test_store128( ; CHECK: %[[A:[^ ]*]] = ptrtoint i128* %a to i64 -; CHECK: call void @__hwasan_store16(i64 %[[A]]) +; ABORT: call void @__hwasan_store16(i64 %[[A]]) +; RECOVER: call void @__hwasan_store16_noabort(i64 %[[A]]) ; CHECK: store i128 %b, i128* %a ; CHECK: ret void @@ -140,7 +152,8 @@ entry: define void @test_store40(i40* %a, i40 %b) sanitize_hwaddress { ; CHECK-LABEL: @test_store40( ; CHECK: %[[A:[^ ]*]] = ptrtoint i40* %a to i64 -; CHECK: call void @__hwasan_store(i64 %[[A]], i64 5) +; ABORT: call void @__hwasan_store(i64 %[[A]], i64 5) +; RECOVER: call void @__hwasan_store_noabort(i64 %[[A]], i64 5) ; CHECK: store i40 %b, i40* %a ; CHECK: ret void diff --git a/test/MC/AArch64/arm64-system-encoding.s b/test/MC/AArch64/arm64-system-encoding.s index ef4037b7bf3f..19ed248db3a8 100644 --- a/test/MC/AArch64/arm64-system-encoding.s +++ b/test/MC/AArch64/arm64-system-encoding.s @@ -1,4 +1,5 @@ ; RUN: not llvm-mc -triple arm64-apple-darwin -show-encoding < %s 2> %t | FileCheck %s +; RUN: not llvm-mc -triple arm64-apple-darwin -mattr=+v8.3a -show-encoding < %s 2> %t | FileCheck %s --check-prefix=CHECK-V83 ; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s foo: @@ -233,6 +234,7 @@ foo: mrs x3, AMAIR_EL3 mrs x3, CCSIDR_EL1 mrs x3, CLIDR_EL1 + mrs x3, CCSIDR2_EL1 mrs x3, CNTFRQ_EL0 mrs x3, CNTHCTL_EL2 mrs x3, CNTHP_CTL_EL2 @@ -418,6 +420,7 @@ foo: ; CHECK: mrs x3, AMAIR_EL3 ; encoding: [0x03,0xa3,0x3e,0xd5] ; CHECK: mrs x3, CCSIDR_EL1 ; encoding: [0x03,0x00,0x39,0xd5] ; CHECK: mrs x3, CLIDR_EL1 ; encoding: [0x23,0x00,0x39,0xd5] +; CHECK-V83: mrs x3, CCSIDR2_EL1 ; encoding: [0x43,0x00,0x39,0xd5] ; CHECK: mrs x3, CNTFRQ_EL0 ; encoding: [0x03,0xe0,0x3b,0xd5] ; CHECK: mrs x3, CNTHCTL_EL2 ; encoding: [0x03,0xe1,0x3c,0xd5] ; CHECK: mrs x3, CNTHP_CTL_EL2 ; encoding: [0x23,0xe2,0x3c,0xd5] diff --git a/test/MC/AArch64/basic-a64-diagnostics.s b/test/MC/AArch64/basic-a64-diagnostics.s index 597adfb3a25b..3b791bef0b6f 100644 --- a/test/MC/AArch64/basic-a64-diagnostics.s +++ b/test/MC/AArch64/basic-a64-diagnostics.s @@ -3504,6 +3504,7 @@ msr MIDR_EL1, x12 msr CCSIDR_EL1, x12 msr CLIDR_EL1, x12 + msr CCSIDR2_EL1, x12 msr CTR_EL0, x12 msr MPIDR_EL1, x12 msr REVIDR_EL1, x12 @@ -3572,6 +3573,9 @@ // CHECK-ERROR-NEXT: msr CLIDR_EL1, x12 // CHECK-ERROR-NEXT: ^ // CHECK-ERROR-NEXT: error: expected writable system register or pstate +// CHECK-ERROR-NEXT: msr CCSIDR2_EL1, x12 +// CHECK-ERROR-NEXT: ^ +// CHECK-ERROR-NEXT: error: expected writable system register or pstate // CHECK-ERROR-NEXT: msr CTR_EL0, x12 // CHECK-ERROR-NEXT: ^ // CHECK-ERROR-NEXT: error: expected writable system register or pstate diff --git a/test/MC/AArch64/dot-req.s b/test/MC/AArch64/dot-req.s index a557f0c67589..582674b1f8d9 100644 --- a/test/MC/AArch64/dot-req.s +++ b/test/MC/AArch64/dot-req.s @@ -37,3 +37,13 @@ bar: // CHECK: fmov d2, d3 // encoding: [0x62,0x40,0x60,0x1e] // CHECK: ldr q2, [sp] // encoding: [0xe2,0x03,0xc0,0x3d] // CHECK: mov v0.8b, v1.8b // encoding: [0x20,0x1c,0xa1,0x0e] + + peter .req x6 + add peter, x0, x0 + .unreq peter +// CHECK: add x6, x0, x0 + + zoe .req x6 + add zoe, x0, x0 + .unreq zoe +// CHECK: add x6, x0, x0 diff --git a/test/MC/AMDGPU/ds.s b/test/MC/AMDGPU/ds.s index ef36a98f746a..b06101a4051b 100644 --- a/test/MC/AMDGPU/ds.s +++ b/test/MC/AMDGPU/ds.s @@ -511,6 +511,10 @@ ds_swizzle_b32 v8, v2 // SICI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0xd4,0xd8,0x02,0x00,0x00,0x08] // VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] +ds_swizzle_b32 v8, v2 gds +// SICI: ds_swizzle_b32 v8, v2 gds ; encoding: [0x00,0x00,0xd6,0xd8,0x02,0x00,0x00,0x08] +// VI: ds_swizzle_b32 v8, v2 gds ; encoding: [0x00,0x00,0x7b,0xd8,0x02,0x00,0x00,0x08] + ds_swizzle_b32 v8, v2 offset:0xFFFF // SICI: ds_swizzle_b32 v8, v2 offset:65535 ; encoding: [0xff,0xff,0xd4,0xd8,0x02,0x00,0x00,0x08] // VI: ds_swizzle_b32 v8, v2 offset:65535 ; encoding: [0xff,0xff,0x7a,0xd8,0x02,0x00,0x00,0x08] diff --git a/test/MC/AMDGPU/expressions.s b/test/MC/AMDGPU/expressions.s index 7b0e90378a06..dd4957c8baec 100644 --- a/test/MC/AMDGPU/expressions.s +++ b/test/MC/AMDGPU/expressions.s @@ -46,3 +46,11 @@ BB2: s_sub_u32 vcc_lo, vcc_lo, (BB2+4)-BB1 // VI: s_sub_u32 vcc_lo, vcc_lo, (BB2+4)-BB1 ; encoding: [0x6a,0xff,0xea,0x80,A,A,A,A] // VI-NEXT: ; fixup A - offset: 4, value: (BB2+4)-BB1, kind: FK_Data_4 + +t=1 +s_sub_u32 s0, s0, -t +// VI: s_sub_u32 s0, s0, -1 ; encoding: [0x00,0xc1,0x80,0x80] + +t=-1 +s_sub_u32 s0, s0, -t +// VI: s_sub_u32 s0, s0, 1 ; encoding: [0x00,0x81,0x80,0x80] diff --git a/test/MC/AMDGPU/invalid-instructions-spellcheck.s b/test/MC/AMDGPU/invalid-instructions-spellcheck.s new file mode 100644 index 000000000000..f4198f10f4b8 --- /dev/null +++ b/test/MC/AMDGPU/invalid-instructions-spellcheck.s @@ -0,0 +1,48 @@ +# RUN: not llvm-mc -triple amdgcn < %s 2>&1 | FileCheck %s + +# This tests the mnemonic spell checker. + +# First check what happens when an instruction is omitted: + +v2, v4, v6 + +# CHECK: unknown token in expression +# CHECK-NEXT: v2, v4, v6 +# CHECK-NEXT: ^ + +# CHECK: error: not a valid operand. +# CHECK-NEXT: v2, v4, v6 +# CHECK-NEXT: ^ + +# We don't want to see a suggestion here; the edit distance is too large to +# give sensible suggestions: + +aaaaaaaaaaaaaaa v1, v2, v3 + +# CHECK: error: invalid instruction +# CHECK-NEXT: aaaaaaaaaaaaaaa v1, v2, v3 +# CHECK-NEXT: ^ + +# Check that we get one suggestion: 'dsc_write_src2_b64' is 1 edit away, i.e. an deletion. + +dsc_write_src2_b64 v1, v2, v3 + +# CHECK: error: invalid instruction, did you mean: ds_write_src2_b64? +# CHECK-NEXT: dsc_write_src2_b64 v1, v2, v3 +# CHECK-NEXT: ^ + +# Check edit distance 1 and 2, just insertions: + +s_mov_b v1, v2 + +# CHECK: error: invalid instruction, did you mean: s_mov_b32, s_mov_b64? +# CHECK-NEXT: s_mov_b v1, v2 +# CHECK-NEXT: ^ + +# Check an instruction that is 2 edits away, and also has a lot of candidates: + +s_load_dwordx v1, v2, v3 + +# CHECK: error: invalid instruction, did you mean: s_load_dword, s_load_dwordx16, s_load_dwordx2, s_load_dwordx4, s_load_dwordx8? +# CHECK-NEXT: s_load_dwordx v1, v2, v3 +# CHECK-NEXT: ^ diff --git a/test/MC/AMDGPU/trap.s b/test/MC/AMDGPU/trap.s index de551ca00b52..7b527ba3072e 100644 --- a/test/MC/AMDGPU/trap.s +++ b/test/MC/AMDGPU/trap.s @@ -190,6 +190,48 @@ s_mov_b64 ttmp[14:15], exec // GFX9: s_mov_b64 ttmp[14:15], exec ; encoding: [0x7e,0x01,0xfa,0xbe] //===----------------------------------------------------------------------===// +// Trap Handler related - 8-dword registers +// NB: gfx7 doc states that SMRD does not support trap registers for dst +//===----------------------------------------------------------------------===// + +s_buffer_load_dwordx8 ttmp[0:7], s[0:3], s0 +// VI: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00] +// GFX9: [0x00,0x1b,0x2c,0xc0,0x00,0x00,0x00,0x00] + +s_buffer_load_dwordx8 ttmp[4:11], s[0:3], s0 +// VI: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00] +// GFX9: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00] + +s_buffer_load_dwordx8 ttmp[8:15], s[0:3], s0 +// NOSICIVI: error: not a valid operand +// GFX9: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00] + +s_load_dwordx8 ttmp[0:7], s[0:1], s0 +// VI: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00] +// GFX9: [0x00,0x1b,0x0c,0xc0,0x00,0x00,0x00,0x00] + +s_load_dwordx8 ttmp[4:11], s[0:1], s0 +// VI: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00] +// GFX9: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00] + +s_load_dwordx8 ttmp[8:15], s[0:1], s0 +// NOSICIVI: error: not a valid operand +// GFX9: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00] + +//===----------------------------------------------------------------------===// +// Trap Handler related - 16-dword registers +// NB: gfx7 doc states that SMRD does not support trap registers for dst +//===----------------------------------------------------------------------===// + +s_buffer_load_dwordx16 ttmp[0:15], s[0:3], s0 +// NOSICIVI: error: not a valid operand +// GFX9: [0x00,0x1b,0x30,0xc0,0x00,0x00,0x00,0x00] + +s_load_dwordx16 ttmp[0:15], s[0:1], s0 +// NOSICIVI: error: not a valid operand +// GFX9: [0x00,0x1b,0x10,0xc0,0x00,0x00,0x00,0x00] + +//===----------------------------------------------------------------------===// // Trap Handler related - Some specific instructions //===----------------------------------------------------------------------===// diff --git a/test/MC/AMDGPU/vop1-gfx9-err.s b/test/MC/AMDGPU/vop1-gfx9-err.s index 0036b79c71b5..61bf5f661759 100644 --- a/test/MC/AMDGPU/vop1-gfx9-err.s +++ b/test/MC/AMDGPU/vop1-gfx9-err.s @@ -1,6 +1,6 @@ -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s 2>&1 | FileCheck -check-prefixes=GCN,GFX9 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefixes=GCN,VI %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii -show-encoding %s 2>&1 | FileCheck -check-prefixes=GCN,CI %s v_swap_b32 v1, 1 // GCN: :16: error: invalid operand for instruction @@ -10,7 +10,9 @@ v_swap_b32 v1, s0 // FIXME: Better error for it requiring VOP1 encoding v_swap_b32_e64 v1, v2 -// GCN: :1: error: unrecognized instruction mnemonic +// GFX9: :1: error: invalid instruction, did you mean: v_swap_b32? +// CI: :1: error: invalid instruction +// VI: :1: error: invalid instruction v_swap_b32 v1, v2, v1 // GCN: :20: error: invalid operand for instruction @@ -22,4 +24,4 @@ v_swap_b32 v1, v2, v2, v2 // GCN: :20: error: invalid operand for instruction v_swap_codegen_pseudo_b32 v1, v2 -// GCN: :1: error: unrecognized instruction mnemonic +// GCN: :1: error: invalid instruction diff --git a/test/MC/AMDGPU/vop3p-err.s b/test/MC/AMDGPU/vop3p-err.s index 7dfd951b6ad0..fe3fee97b5e7 100644 --- a/test/MC/AMDGPU/vop3p-err.s +++ b/test/MC/AMDGPU/vop3p-err.s @@ -59,16 +59,16 @@ v_pk_add_f16 v1, v2, |v3| // GFX9: :22: error: invalid operand for instruction v_pk_add_f16 v1, v2, abs(v3) -// GFX9: :19: error: invalid operand for instruction +// GFX9: :18: error: invalid operand for instruction v_pk_add_f16 v1, -v2, v3 -// GFX9: :23: error: invalid operand for instruction +// GFX9: :22: error: invalid operand for instruction v_pk_add_f16 v1, v2, -v3 // GFX9: :18: error: invalid operand for instruction v_pk_add_u16 v1, abs(v2), v3 -// GFX9: :19: error: invalid operand for instruction +// GFX9: :18: error: invalid operand for instruction v_pk_add_u16 v1, -v2, v3 // diff --git a/test/MC/ARM/dfb-neg.s b/test/MC/ARM/dfb-neg.s new file mode 100644 index 000000000000..15c44877fa6f --- /dev/null +++ b/test/MC/ARM/dfb-neg.s @@ -0,0 +1,10 @@ +@ RUN: not llvm-mc -triple armv8-none-eabi -mcpu=cortex-r52 -mattr=-dfb -show-encoding < %s 2>&1 | FileCheck %s +@ RUN: not llvm-mc -triple thumbv8-none-eabi -mcpu=cortex-r52 -mattr=-dfb -show-encoding < %s 2>&1 | FileCheck %s + + dfb +@ CHECK: error: instruction requires: full-data-barrier + + dfb sy + dfb #0 +@ CHECK: error: invalid instruction +@ CHECK: error: invalid instruction diff --git a/test/MC/ARM/dfb.s b/test/MC/ARM/dfb.s new file mode 100644 index 000000000000..58477749807f --- /dev/null +++ b/test/MC/ARM/dfb.s @@ -0,0 +1,6 @@ +@ RUN: llvm-mc -triple armv8-none-eabi -mcpu=cortex-r52 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-ARM +@ RUN: llvm-mc -triple thumbv8-none-eabi -mcpu=cortex-r52 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-THUMB + + dfb +@ CHECK-ARM: dfb @ encoding: [0x4c,0xf0,0x7f,0xf5] +@ CHECK-THUMB: dfb @ encoding: [0xbf,0xf3,0x4c,0x8f] diff --git a/test/MC/COFF/align-nops.s b/test/MC/COFF/align-nops.s index 02b488475e90..6d23721ed779 100644 --- a/test/MC/COFF/align-nops.s +++ b/test/MC/COFF/align-nops.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 %s | llvm-readobj -s -sd | FileCheck %s +// RUN: llvm-mc -filetype=obj -triple i686-pc-win32 -mcpu=pentiumpro %s | llvm-readobj -s -sd | FileCheck %s // Test that we get optimal nops in text .text diff --git a/test/MC/Disassembler/AArch64/basic-a64-instructions.txt b/test/MC/Disassembler/AArch64/basic-a64-instructions.txt index a2f9d24091ef..33b8d6b4de44 100644 --- a/test/MC/Disassembler/AArch64/basic-a64-instructions.txt +++ b/test/MC/Disassembler/AArch64/basic-a64-instructions.txt @@ -1,6 +1,7 @@ # RUN: llvm-mc -triple=aarch64 -mattr=+fp-armv8 -disassemble < %s | FileCheck %s # RUN: llvm-mc -triple=arm64 -mattr=+fp-armv8 -disassemble < %s | FileCheck %s # RUN: llvm-mc -triple=arm64 -mattr=+fp-armv8,+fullfp16 -disassemble < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FP16 +# RUN: llvm-mc -triple=arm64 -mattr=+v8.3a -disassemble < %s | FileCheck %s --check-prefix=CHECK-V83 #------------------------------------------------------------------------------ # Add/sub (immediate) @@ -3493,6 +3494,7 @@ # CHECK: mrs x9, {{midr_el1|MIDR_EL1}} # CHECK: mrs x9, {{ccsidr_el1|CCSIDR_EL1}} # CHECK: mrs x9, {{csselr_el1|CSSELR_EL1}} +# CHECK-V83: mrs x9, {{ccsidr2_el1|CCSIDR2_EL1}} # CHECK: mrs x9, {{vpidr_el2|VPIDR_EL2}} # CHECK: mrs x9, {{clidr_el1|CLIDR_EL1}} # CHECK: mrs x9, {{ctr_el0|CTR_EL0}} @@ -4048,6 +4050,7 @@ 0x9 0x0 0x38 0xd5 0x9 0x0 0x39 0xd5 0x9 0x0 0x3a 0xd5 +0x49 0x0 0x39 0xd5 0x9 0x0 0x3c 0xd5 0x29 0x0 0x39 0xd5 0x29 0x0 0x3b 0xd5 diff --git a/test/MC/Disassembler/AMDGPU/ds_vi.txt b/test/MC/Disassembler/AMDGPU/ds_vi.txt index 6d910ea5bb58..c12e7a157e82 100644 --- a/test/MC/Disassembler/AMDGPU/ds_vi.txt +++ b/test/MC/Disassembler/AMDGPU/ds_vi.txt @@ -171,6 +171,9 @@ # VI: ds_swizzle_b32 v8, v2 ; encoding: [0x00,0x00,0x7a,0xd8,0x02,0x00,0x00,0x08] 0x00 0x00 0x7a 0xd8 0x02 0x00 0x00 0x08 +# VI: ds_swizzle_b32 v8, v2 gds ; encoding: [0x00,0x00,0x7b,0xd8,0x02,0x00,0x00,0x08] +0x00 0x00 0x7b 0xd8 0x02 0x00 0x00 0x08 + # VI: ds_read_b32 v8, v2 ; encoding: [0x00,0x00,0x6c,0xd8,0x02,0x00,0x00,0x08] 0x00 0x00 0x6c 0xd8 0x02 0x00 0x00 0x08 diff --git a/test/MC/Disassembler/AMDGPU/trap_gfx9.txt b/test/MC/Disassembler/AMDGPU/trap_gfx9.txt index 70a570eec5ab..0b140c42168a 100644 --- a/test/MC/Disassembler/AMDGPU/trap_gfx9.txt +++ b/test/MC/Disassembler/AMDGPU/trap_gfx9.txt @@ -107,3 +107,35 @@ # GFX9: buffer_atomic_inc v1, off, ttmp[12:15], 56 glc ; encoding: [0x00,0x40,0x2c,0xe1,0x00,0x01,0x1e,0xb8] 0x00,0x40,0x2c,0xe1,0x00,0x01,0x1e,0xb8 + +#===----------------------------------------------------------------------===# +# Trap Handler related - 8-dword registers +#===----------------------------------------------------------------------===# + +# GFX9: s_buffer_load_dwordx8 ttmp[0:7], s[0:3], s0 ; encoding: [0x00,0x1b,0x2c,0xc0,0x00,0x00,0x00,0x00] +0x00,0x1b,0x2c,0xc0,0x00,0x00,0x00,0x00 + +# GFX9: s_buffer_load_dwordx8 ttmp[4:11], s[0:3], s0 ; encoding: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00] +0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00 + +# GFX9: s_buffer_load_dwordx8 ttmp[8:15], s[0:3], s0 ; encoding: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00] +0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00 + +# GFX9: s_load_dwordx8 ttmp[0:7], s[0:1], s0 ; encoding: [0x00,0x1b,0x0c,0xc0,0x00,0x00,0x00,0x00] +0x00,0x1b,0x0c,0xc0,0x00,0x00,0x00,0x00 + +# GFX9: s_load_dwordx8 ttmp[4:11], s[0:1], s0 ; encoding: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00] +0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00 + +# GFX9: s_load_dwordx8 ttmp[8:15], s[0:1], s0 ; encoding: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00] +0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00 + +#===----------------------------------------------------------------------===# +# Trap Handler related - 16-dword registers +#===----------------------------------------------------------------------===# + +# GFX9: s_buffer_load_dwordx16 ttmp[0:15], s[0:3], s0 ; encoding: [0x00,0x1b,0x30,0xc0,0x00,0x00,0x00,0x00] +0x00,0x1b,0x30,0xc0,0x00,0x00,0x00,0x00 + +# GFX9: s_load_dwordx16 ttmp[0:15], s[0:1], s0 ; encoding: [0x00,0x1b,0x10,0xc0,0x00,0x00,0x00,0x00] +0x00,0x1b,0x10,0xc0,0x00,0x00,0x00,0x00 diff --git a/test/MC/Disassembler/AMDGPU/trap_vi.txt b/test/MC/Disassembler/AMDGPU/trap_vi.txt index 8b131512050c..eb254134cc53 100644 --- a/test/MC/Disassembler/AMDGPU/trap_vi.txt +++ b/test/MC/Disassembler/AMDGPU/trap_vi.txt @@ -107,3 +107,19 @@ # VI: buffer_atomic_inc v1, off, ttmp[8:11], 56 glc ; encoding: [0x00,0x40,0x2c,0xe1,0x00,0x01,0x1e,0xb8] 0x00,0x40,0x2c,0xe1,0x00,0x01,0x1e,0xb8 + +#===----------------------------------------------------------------------===# +# Trap Handler related - 8-dword registers +#===----------------------------------------------------------------------===# + +# VI: s_buffer_load_dwordx8 ttmp[0:7], s[0:3], s0 ; encoding: [0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00] +0x00,0x1c,0x2c,0xc0,0x00,0x00,0x00,0x00 + +# VI: s_buffer_load_dwordx8 ttmp[4:11], s[0:3], s0 ; encoding: [0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00] +0x00,0x1d,0x2c,0xc0,0x00,0x00,0x00,0x00 + +# VI: s_load_dwordx8 ttmp[0:7], s[0:1], s0 ; encoding: [0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00] +0x00,0x1c,0x0c,0xc0,0x00,0x00,0x00,0x00 + +# VI: s_load_dwordx8 ttmp[4:11], s[0:1], s0 ; encoding: [0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00] +0x00,0x1d,0x0c,0xc0,0x00,0x00,0x00,0x00 diff --git a/test/MC/Disassembler/ARM/dfb-arm.txt b/test/MC/Disassembler/ARM/dfb-arm.txt new file mode 100644 index 000000000000..26f81621274c --- /dev/null +++ b/test/MC/Disassembler/ARM/dfb-arm.txt @@ -0,0 +1,6 @@ +# RUN: llvm-mc -disassemble -triple armv8-none-eabi -mcpu=cortex-r52 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DFB +# RUN: llvm-mc -disassemble -triple armv8-none-eabi -mcpu=cortex-r52 -mattr=-dfb -show-encoding < %s | FileCheck %s --check-prefix=CHECK-NODFB + +# CHECK-DFB: dfb @ encoding: [0x4c,0xf0,0x7f,0xf5] +# CHECK-NODFB: dsb #0xc @ encoding: [0x4c,0xf0,0x7f,0xf5] +[0x4c,0xf0,0x7f,0xf5] diff --git a/test/MC/Disassembler/ARM/dfb-thumb.txt b/test/MC/Disassembler/ARM/dfb-thumb.txt new file mode 100644 index 000000000000..aa8adc83c1f4 --- /dev/null +++ b/test/MC/Disassembler/ARM/dfb-thumb.txt @@ -0,0 +1,6 @@ +# RUN: llvm-mc -disassemble -triple thumbv8-none-eabi -mcpu=cortex-r52 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DFB +# RUN: llvm-mc -disassemble -triple thumbv8-none-eabi -mcpu=cortex-r52 -mattr=-dfb -show-encoding < %s | FileCheck %s --check-prefix=CHECK-NODFB + +# CHECK-DFB: dfb @ encoding: [0xbf,0xf3,0x4c,0x8f] +# CHECK-NODFB: dsb #0xc @ encoding: [0xbf,0xf3,0x4c,0x8f] +[0xbf,0xf3,0x4c,0x8f] diff --git a/test/MC/Disassembler/X86/x86-32.txt b/test/MC/Disassembler/X86/x86-32.txt index 4211721ec48b..cc05dfb6f896 100644 --- a/test/MC/Disassembler/X86/x86-32.txt +++ b/test/MC/Disassembler/X86/x86-32.txt @@ -667,6 +667,9 @@ # CHECK: prefetchw (%eax) 0x0f 0x0d 0x08 +# CHECK: prefetchwt1 (%eax) +0x0f 0x0d 0x10 + # CHECK: adcxl %eax, %eax 0x66 0x0f 0x38 0xf6 0xc0 diff --git a/test/MC/ELF/align-nops.s b/test/MC/ELF/align-nops.s index 5e3386823f26..32da3dbd8e82 100644 --- a/test/MC/ELF/align-nops.s +++ b/test/MC/ELF/align-nops.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -s -sd | FileCheck %s +// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - | llvm-readobj -s -sd | FileCheck %s // Test that we get optimal nops in text .text diff --git a/test/MC/MachO/x86_32-optimal_nop.s b/test/MC/MachO/x86_32-optimal_nop.s index 01d8a1f6eb2a..1bc9eff337c4 100644 --- a/test/MC/MachO/x86_32-optimal_nop.s +++ b/test/MC/MachO/x86_32-optimal_nop.s @@ -1,4 +1,4 @@ -// RUN: llvm-mc -triple i386-apple-darwin9 %s -filetype=obj -o - | llvm-readobj -file-headers -s -sd -r -t -macho-segment -macho-dysymtab -macho-indirect-symbols | FileCheck %s +// RUN: llvm-mc -triple i386-apple-darwin9 -mcpu=pentiumpro %s -filetype=obj -o - | llvm-readobj -file-headers -s -sd -r -t -macho-segment -macho-dysymtab -macho-indirect-symbols | FileCheck %s # 1 byte nop test .align 4, 0 # start with 16 byte alignment filled with zeros diff --git a/test/MC/Mips/eva/invalid.s b/test/MC/Mips/eva/invalid.s index 5c4cc3ddf631..1f7c2e6ac8c6 100644 --- a/test/MC/Mips/eva/invalid.s +++ b/test/MC/Mips/eva/invalid.s @@ -1,35 +1,36 @@ # Instructions that are invalid # # RUN: not llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips32r2 \ -# RUN: -mattr==eva 2>%t1 +# RUN: -mattr=+eva 2>%t1 # RUN: FileCheck %s < %t1 .set noat - cachee -1, 255($7) # CHECK: :[[@LINE]]:12: error: invalid operand for instruction - cachee 32, 255($7) # CHECK: :[[@LINE]]:12: error: invalid operand for instruction - prefe -1, 255($7) # CHECK: :[[@LINE]]:11: error: invalid operand for instruction - prefe 32, 255($7) # CHECK: :[[@LINE]]:11: error: invalid operand for instruction + cachee -1, 255($7) # CHECK: :[[@LINE]]:12: error: expected 5-bit unsigned immediate + cachee 32, 255($7) # CHECK: :[[@LINE]]:12: error: expected 5-bit unsigned immediate + prefe -1, 255($7) # CHECK: :[[@LINE]]:11: error: expected 5-bit unsigned immediate + prefe 32, 255($7) # CHECK: :[[@LINE]]:11: error: expected 5-bit unsigned immediate lle $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction - lle $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction - lle $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction - lle $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction + lle $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset + lle $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset + lle $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset lwe $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction - lwe $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction - lwe $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction - lwe $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction + lwe $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset + lwe $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset + lwe $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset sbe $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction - sbe $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction - sbe $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction - sbe $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction + sbe $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset + sbe $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset + sbe $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset sce $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction - sce $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction - sce $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction - sce $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction + sce $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset + sce $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset + sce $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset she $33, 8($5) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction - she $4, 8($33) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction - she $4, 512($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction - she $4, -513($5) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction + she $4, 8($33) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset + she $4, 512($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset + she $4, -513($5) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset swe $33, 8($4) # CHECK: :[[@LINE]]:9: error: invalid operand for instruction - swe $5, 8($34) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction - swe $5, 512($4) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction - swe $5, -513($4) # CHECK: :[[@LINE]]:13: error: invalid operand for instruction + swe $5, 8($34) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset + swe $5, 512($4) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset + swe $5, -513($4) # CHECK: :[[@LINE]]:13: error: expected memory with 9-bit signed offset + diff --git a/test/MC/WebAssembly/weak-alias.ll b/test/MC/WebAssembly/weak-alias.ll index 5f5d9b447c71..29eeac02da66 100644 --- a/test/MC/WebAssembly/weak-alias.ll +++ b/test/MC/WebAssembly/weak-alias.ll @@ -8,21 +8,41 @@ @bar = global i32 7, align 8 @bar_alias = weak hidden alias i32, i32* @bar -@bar_alias_address = global i32* @bar_alias, align 8 - @foo_alias = weak hidden alias i32 (), i32 ()* @foo +@direct_address = global i32()* @foo, align 8 +@alias_address = global i32()* @foo_alias, align 8 + +define hidden i32 @foo() #0 { +entry: + ret i32 0 +} + +define hidden i32 @call_direct() #0 { +entry: + %call = call i32 @foo() + ret i32 %call +} + define hidden i32 @call_alias() #0 { entry: %call = call i32 @foo_alias() ret i32 %call } -define hidden i32 @foo() #0 { +define hidden i32 @call_direct_ptr() #0 { entry: - ret i32 0 + %0 = load i32 ()*, i32 ()** @direct_address, align 8 + %call = call i32 %0() + ret i32 %call } +define hidden i32 @call_alias_ptr() #0 { +entry: + %0 = load i32 ()*, i32 ()** @alias_address, align 8 + %call = call i32 %0() + ret i32 %call +} ; CHECK: - Type: TYPE ; CHECK-NEXT: Signatures: @@ -42,47 +62,112 @@ entry: ; CHECK-NEXT: Table: ; CHECK-NEXT: ElemType: ANYFUNC ; CHECK-NEXT: Limits: -; CHECK-NEXT: Initial: 0x00000000 +; CHECK-NEXT: Initial: 0x00000002 +; CHECK-NEXT: - Module: env +; CHECK-NEXT: Field: foo_alias +; CHECK-NEXT: Kind: FUNCTION +; CHECK-NEXT: SigIndex: 0 +; CHECK-NEXT: - Module: env +; CHECK-NEXT: Field: bar_alias +; CHECK-NEXT: Kind: GLOBAL +; CHECK-NEXT: GlobalType: I32 +; CHECK-NEXT: GlobalMutable: false ; CHECK-NEXT: - Type: FUNCTION -; CHECK-NEXT: FunctionTypes: [ 0, 0 ] +; CHECK-NEXT: FunctionTypes: [ 0, 0, 0, 0, 0 ] ; CHECK-NEXT: - Type: GLOBAL ; CHECK-NEXT: Globals: ; CHECK-NEXT: - Type: I32 ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 0 +; CHECK-NEXT: Value: 8 ; CHECK-NEXT: - Type: I32 ; CHECK-NEXT: Mutable: false ; CHECK-NEXT: InitExpr: ; CHECK-NEXT: Opcode: I32_CONST -; CHECK-NEXT: Value: 8 +; CHECK-NEXT: Value: 16 +; CHECK-NEXT: - Type: I32 +; CHECK-NEXT: Mutable: false +; CHECK-NEXT: InitExpr: +; CHECK-NEXT: Opcode: I32_CONST +; CHECK-NEXT: Value: 0 ; CHECK-NEXT: - Type: EXPORT ; CHECK-NEXT: Exports: +; CHECK-NEXT: - Name: foo +; CHECK-NEXT: Kind: FUNCTION +; CHECK-NEXT: Index: 1 +; CHECK-NEXT: - Name: call_direct +; CHECK-NEXT: Kind: FUNCTION +; CHECK-NEXT: Index: 2 ; CHECK-NEXT: - Name: call_alias ; CHECK-NEXT: Kind: FUNCTION -; CHECK-NEXT: Index: 0 -; CHECK-NEXT: - Name: foo +; CHECK-NEXT: Index: 3 +; CHECK-NEXT: - Name: call_direct_ptr ; CHECK-NEXT: Kind: FUNCTION +; CHECK-NEXT: Index: 4 +; CHECK-NEXT: - Name: direct_address +; CHECK-NEXT: Kind: GLOBAL ; CHECK-NEXT: Index: 1 -; CHECK-NEXT: - Name: bar +; CHECK-NEXT: - Name: call_alias_ptr +; CHECK-NEXT: Kind: FUNCTION +; CHECK-NEXT: Index: 5 +; CHECK-NEXT: - Name: alias_address ; CHECK-NEXT: Kind: GLOBAL -; CHECK-NEXT: Index: 0 -; CHECK-NEXT: - Name: bar_alias_address +; CHECK-NEXT: Index: 2 +; CHECK-NEXT: - Name: bar ; CHECK-NEXT: Kind: GLOBAL -; CHECK-NEXT: Index: 1 +; CHECK-NEXT: Index: 3 ; CHECK-NEXT: - Name: foo_alias ; CHECK-NEXT: Kind: FUNCTION ; CHECK-NEXT: Index: 1 ; CHECK-NEXT: - Name: bar_alias ; CHECK-NEXT: Kind: GLOBAL +; CHECK-NEXT: Index: 3 +; CHECK-NEXT: - Type: ELEM +; CHECK-NEXT: Segments: +; CHECK-NEXT: - Offset: +; CHECK-NEXT: Opcode: I32_CONST +; CHECK-NEXT: Value: 0 +; CHECK-NEXT: Functions: [ 1, 0 ] +; CHECK-NEXT: - Type: CODE +; CHECK-NEXT: Relocations: +; CHECK-NEXT: - Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB +; CHECK-NEXT: Index: 1 +; CHECK-NEXT: Offset: 0x00000009 +; CHECK-NEXT: - Type: R_WEBASSEMBLY_FUNCTION_INDEX_LEB ; CHECK-NEXT: Index: 0 - -; CHECK: - Type: DATA +; CHECK-NEXT: Offset: 0x00000012 +; CHECK-NEXT: - Type: R_WEBASSEMBLY_MEMORY_ADDR_LEB +; CHECK-NEXT: Index: 1 +; CHECK-NEXT: Offset: 0x0000001E +; CHECK-NEXT: - Type: R_WEBASSEMBLY_TYPE_INDEX_LEB +; CHECK-NEXT: Index: 0 +; CHECK-NEXT: Offset: 0x00000024 +; CHECK-NEXT: - Type: R_WEBASSEMBLY_MEMORY_ADDR_LEB +; CHECK-NEXT: Index: 2 +; CHECK-NEXT: Offset: 0x00000031 +; CHECK-NEXT: - Type: R_WEBASSEMBLY_TYPE_INDEX_LEB +; CHECK-NEXT: Index: 0 +; CHECK-NEXT: Offset: 0x00000037 +; CHECK-NEXT: Functions: +; CHECK-NEXT: - Locals: +; CHECK-NEXT: Body: 41000B +; CHECK-NEXT: - Locals: +; CHECK-NEXT: Body: 1081808080000B +; CHECK-NEXT: - Locals: +; CHECK-NEXT: Body: 1080808080000B +; CHECK-NEXT: - Locals: +; CHECK-NEXT: Body: 410028028880808000118080808000000B +; CHECK-NEXT: - Locals: +; CHECK-NEXT: Body: 410028029080808000118080808000000B +; CHECK-NEXT: - Type: DATA ; CHECK-NEXT: Relocations: -; CHECK-NEXT: - Type: R_WEBASSEMBLY_MEMORY_ADDR_I32 +; CHECK-NEXT: - Type: R_WEBASSEMBLY_TABLE_INDEX_I32 ; CHECK-NEXT: Index: 0 ; CHECK-NEXT: Offset: 0x0000000F +; CHECK-NEXT: - Type: R_WEBASSEMBLY_TABLE_INDEX_I32 +; CHECK-NEXT: Index: 1 +; CHECK-NEXT: Offset: 0x00000018 ; CHECK-NEXT: Segments: ; CHECK-NEXT: - SectionOffset: 6 ; CHECK-NEXT: MemoryIndex: 0 @@ -101,38 +186,64 @@ entry: ; CHECK-NEXT: Name: name ; CHECK-NEXT: FunctionNames: ; CHECK-NEXT: - Index: 0 -; CHECK-NEXT: Name: call_alias +; CHECK-NEXT: Name: foo_alias ; CHECK-NEXT: - Index: 1 ; CHECK-NEXT: Name: foo +; CHECK-NEXT: - Index: 2 +; CHECK-NEXT: Name: call_direct +; CHECK-NEXT: - Index: 3 +; CHECK-NEXT: Name: call_alias +; CHECK-NEXT: - Index: 4 +; CHECK-NEXT: Name: call_direct_ptr +; CHECK-NEXT: - Index: 5 +; CHECK-NEXT: Name: call_alias_ptr ; CHECK-NEXT: - Type: CUSTOM ; CHECK-NEXT: Name: linking -; CHECK-NEXT: DataSize: 12 +; CHECK-NEXT: DataSize: 20 ; CHECK-NEXT: SymbolInfo: -; CHECK-NEXT: - Name: call_alias -; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ] -; CHECK-NEXT: - Name: foo -; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ] ; CHECK-NEXT: - Name: foo_alias ; CHECK-NEXT: Flags: [ BINDING_WEAK, VISIBILITY_HIDDEN ] ; CHECK-NEXT: - Name: bar_alias ; CHECK-NEXT: Flags: [ BINDING_WEAK, VISIBILITY_HIDDEN ] +; CHECK-NEXT: - Name: foo +; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ] +; CHECK-NEXT: - Name: call_direct +; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ] +; CHECK-NEXT: - Name: call_alias +; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ] +; CHECK-NEXT: - Name: call_direct_ptr +; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ] +; CHECK-NEXT: - Name: call_alias_ptr +; CHECK-NEXT: Flags: [ VISIBILITY_HIDDEN ] ; CHECK-NEXT: SegmentInfo: ; CHECK-NEXT: - Index: 0 ; CHECK-NEXT: Name: .data.bar ; CHECK-NEXT: Alignment: 8 ; CHECK-NEXT: Flags: [ ] ; CHECK-NEXT: - Index: 1 -; CHECK-NEXT: Name: .data.bar_alias_address +; CHECK-NEXT: Name: .data.direct_address +; CHECK-NEXT: Alignment: 8 +; CHECK-NEXT: Flags: [ ] +; CHECK-NEXT: - Index: 2 +; CHECK-NEXT: Name: .data.alias_address ; CHECK-NEXT: Alignment: 8 ; CHECK-NEXT: Flags: [ ] ; CHECK-NEXT: ... ; CHECK-SYMS: SYMBOL TABLE: -; CHECK-SYMS-NEXT: 00000000 g F name call_alias +; CHECK-SYMS-NEXT: 00000000 g F name foo_alias ; CHECK-SYMS-NEXT: 00000001 g F name foo -; CHECK-SYMS-NEXT: 00000000 g F EXPORT .hidden call_alias -; CHECK-SYMS-NEXT: 00000001 g F EXPORT .hidden foo -; CHECK-SYMS-NEXT: 00000000 g EXPORT bar -; CHECK-SYMS-NEXT: 00000008 g EXPORT bar_alias_address +; CHECK-SYMS-NEXT: 00000002 g F name call_direct +; CHECK-SYMS-NEXT: 00000003 g F name call_alias +; CHECK-SYMS-NEXT: 00000004 g F name call_direct_ptr +; CHECK-SYMS-NEXT: 00000005 g F name call_alias_ptr ; CHECK-SYMS-NEXT: 00000001 gw F EXPORT .hidden foo_alias ; CHECK-SYMS-NEXT: 00000000 gw EXPORT .hidden bar_alias +; CHECK-SYMS-NEXT: 00000001 g F EXPORT .hidden foo +; CHECK-SYMS-NEXT: 00000002 g F EXPORT .hidden call_direct +; CHECK-SYMS-NEXT: 00000003 g F EXPORT .hidden call_alias +; CHECK-SYMS-NEXT: 00000004 g F EXPORT .hidden call_direct_ptr +; CHECK-SYMS-NEXT: 00000008 g EXPORT direct_address +; CHECK-SYMS-NEXT: 00000005 g F EXPORT .hidden call_alias_ptr +; CHECK-SYMS-NEXT: 00000010 g EXPORT alias_address +; CHECK-SYMS-NEXT: 00000000 g EXPORT bar diff --git a/test/MC/X86/3DNow.s b/test/MC/X86/3DNow.s index 871857b155d0..e66e39b547ae 100644 --- a/test/MC/X86/3DNow.s +++ b/test/MC/X86/3DNow.s @@ -72,8 +72,10 @@ femms // CHECK: prefetch (%rax) # encoding: [0x0f,0x0d,0x00] // CHECK: prefetchw (%rax) # encoding: [0x0f,0x0d,0x08] +// CHECK: prefetchwt1 (%rax) # encoding: [0x0f,0x0d,0x10] prefetch (%rax) prefetchw (%rax) +prefetchwt1 (%rax) // CHECK: pf2iw %mm2, %mm1 # encoding: [0x0f,0x0f,0xca,0x1c] diff --git a/test/MC/X86/AlignedBundling/different-sections.s b/test/MC/X86/AlignedBundling/different-sections.s index e12153210042..0af4a8f133e6 100644 --- a/test/MC/X86/AlignedBundling/different-sections.s +++ b/test/MC/X86/AlignedBundling/different-sections.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \ +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \ +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s # Test two different executable sections with bundling. diff --git a/test/MC/X86/AlignedBundling/long-nop-pad.s b/test/MC/X86/AlignedBundling/long-nop-pad.s index 36e4f4b553fb..bcc319456d35 100644 --- a/test/MC/X86/AlignedBundling/long-nop-pad.s +++ b/test/MC/X86/AlignedBundling/long-nop-pad.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \ +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \ +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s # Test that long nops are generated for padding where possible. diff --git a/test/MC/X86/AlignedBundling/misaligned-bundle-group.s b/test/MC/X86/AlignedBundling/misaligned-bundle-group.s index 04b3374716bb..edd933ed720d 100644 --- a/test/MC/X86/AlignedBundling/misaligned-bundle-group.s +++ b/test/MC/X86/AlignedBundling/misaligned-bundle-group.s @@ -1,7 +1,7 @@ -# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu %s -o - \ +# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - \ # RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s -# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mc-relax-all %s -o - \ +# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - \ # RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-RELAX %s diff --git a/test/MC/X86/AlignedBundling/misaligned-bundle.s b/test/MC/X86/AlignedBundling/misaligned-bundle.s index 08d616109909..676b667cada5 100644 --- a/test/MC/X86/AlignedBundling/misaligned-bundle.s +++ b/test/MC/X86/AlignedBundling/misaligned-bundle.s @@ -1,7 +1,7 @@ -# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu %s -o - \ +# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - \ # RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s -# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mc-relax-all %s -o - \ +# RUN: llvm-mc -filetype=obj -triple i686-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - \ # RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK-RELAX %s diff --git a/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s b/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s index 158cde8cd450..8605b7c0137d 100644 --- a/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s +++ b/test/MC/X86/AlignedBundling/pad-align-to-bundle-end.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \ +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \ +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s # Test some variations of padding to the end of a bundle. diff --git a/test/MC/X86/AlignedBundling/pad-bundle-groups.s b/test/MC/X86/AlignedBundling/pad-bundle-groups.s index 7a9e30c053ec..5993d73dd6dd 100644 --- a/test/MC/X86/AlignedBundling/pad-bundle-groups.s +++ b/test/MC/X86/AlignedBundling/pad-bundle-groups.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \ +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \ +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s # Test some variations of padding for bundle-locked groups. diff --git a/test/MC/X86/AlignedBundling/relax-in-bundle-group.s b/test/MC/X86/AlignedBundling/relax-in-bundle-group.s index d07619063f00..036249b906b4 100644 --- a/test/MC/X86/AlignedBundling/relax-in-bundle-group.s +++ b/test/MC/X86/AlignedBundling/relax-in-bundle-group.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \ +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \ # RUN: | llvm-objdump -disassemble - | FileCheck %s -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \ +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \ # RUN: | llvm-objdump -disassemble - | FileCheck %s # Test that instructions inside bundle-locked groups are relaxed even if their diff --git a/test/MC/X86/AlignedBundling/single-inst-bundling.s b/test/MC/X86/AlignedBundling/single-inst-bundling.s index a7df2c96a8eb..cb0ad8adba39 100644 --- a/test/MC/X86/AlignedBundling/single-inst-bundling.s +++ b/test/MC/X86/AlignedBundling/single-inst-bundling.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \ +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck -check-prefix=CHECK -check-prefix=CHECK-OPT %s -# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mc-relax-all %s -o - \ +# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu -mcpu=pentiumpro -mc-relax-all %s -o - \ # RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck -check-prefix=CHECK -check-prefix=CHECK-RELAX %s # Test simple NOP insertion for single instructions. diff --git a/test/MC/X86/CLFLUSHOPT-32.s b/test/MC/X86/CLFLUSHOPT-32.s new file mode 100644 index 000000000000..e3df46ced7b0 --- /dev/null +++ b/test/MC/X86/CLFLUSHOPT-32.s @@ -0,0 +1,26 @@ +// RUN: llvm-mc -triple i386-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: clflushopt -485498096(%edx,%eax,4) +// CHECK: encoding: [0x66,0x0f,0xae,0xbc,0x82,0x10,0xe3,0x0f,0xe3] +clflushopt -485498096(%edx,%eax,4) + +// CHECK: clflushopt 485498096(%edx,%eax,4) +// CHECK: encoding: [0x66,0x0f,0xae,0xbc,0x82,0xf0,0x1c,0xf0,0x1c] +clflushopt 485498096(%edx,%eax,4) + +// CHECK: clflushopt 485498096(%edx) +// CHECK: encoding: [0x66,0x0f,0xae,0xba,0xf0,0x1c,0xf0,0x1c] +clflushopt 485498096(%edx) + +// CHECK: clflushopt 485498096 +// CHECK: encoding: [0x66,0x0f,0xae,0x3d,0xf0,0x1c,0xf0,0x1c] +clflushopt 485498096 + +// CHECK: clflushopt 64(%edx,%eax) +// CHECK: encoding: [0x66,0x0f,0xae,0x7c,0x02,0x40] +clflushopt 64(%edx,%eax) + +// CHECK: clflushopt (%edx) +// CHECK: encoding: [0x66,0x0f,0xae,0x3a] +clflushopt (%edx) + diff --git a/test/MC/X86/CLFLUSHOPT-64.s b/test/MC/X86/CLFLUSHOPT-64.s new file mode 100644 index 000000000000..cdecced6fa2e --- /dev/null +++ b/test/MC/X86/CLFLUSHOPT-64.s @@ -0,0 +1,26 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: clflushopt 485498096 +// CHECK: encoding: [0x66,0x0f,0xae,0x3c,0x25,0xf0,0x1c,0xf0,0x1c] +clflushopt 485498096 + +// CHECK: clflushopt 64(%rdx) +// CHECK: encoding: [0x66,0x0f,0xae,0x7a,0x40] +clflushopt 64(%rdx) + +// CHECK: clflushopt 64(%rdx,%rax,4) +// CHECK: encoding: [0x66,0x0f,0xae,0x7c,0x82,0x40] +clflushopt 64(%rdx,%rax,4) + +// CHECK: clflushopt -64(%rdx,%rax,4) +// CHECK: encoding: [0x66,0x0f,0xae,0x7c,0x82,0xc0] +clflushopt -64(%rdx,%rax,4) + +// CHECK: clflushopt 64(%rdx,%rax) +// CHECK: encoding: [0x66,0x0f,0xae,0x7c,0x02,0x40] +clflushopt 64(%rdx,%rax) + +// CHECK: clflushopt (%rdx) +// CHECK: encoding: [0x66,0x0f,0xae,0x3a] +clflushopt (%rdx) + diff --git a/test/MC/X86/CLFSH-32.s b/test/MC/X86/CLFSH-32.s new file mode 100644 index 000000000000..898569ec2df6 --- /dev/null +++ b/test/MC/X86/CLFSH-32.s @@ -0,0 +1,26 @@ +// RUN: llvm-mc -triple i386-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: clflush -485498096(%edx,%eax,4) +// CHECK: encoding: [0x0f,0xae,0xbc,0x82,0x10,0xe3,0x0f,0xe3] +clflush -485498096(%edx,%eax,4) + +// CHECK: clflush 485498096(%edx,%eax,4) +// CHECK: encoding: [0x0f,0xae,0xbc,0x82,0xf0,0x1c,0xf0,0x1c] +clflush 485498096(%edx,%eax,4) + +// CHECK: clflush 485498096(%edx) +// CHECK: encoding: [0x0f,0xae,0xba,0xf0,0x1c,0xf0,0x1c] +clflush 485498096(%edx) + +// CHECK: clflush 485498096 +// CHECK: encoding: [0x0f,0xae,0x3d,0xf0,0x1c,0xf0,0x1c] +clflush 485498096 + +// CHECK: clflush 64(%edx,%eax) +// CHECK: encoding: [0x0f,0xae,0x7c,0x02,0x40] +clflush 64(%edx,%eax) + +// CHECK: clflush (%edx) +// CHECK: encoding: [0x0f,0xae,0x3a] +clflush (%edx) + diff --git a/test/MC/X86/CLFSH-64.s b/test/MC/X86/CLFSH-64.s new file mode 100644 index 000000000000..f2c2ae51c81b --- /dev/null +++ b/test/MC/X86/CLFSH-64.s @@ -0,0 +1,26 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: clflush 485498096 +// CHECK: encoding: [0x0f,0xae,0x3c,0x25,0xf0,0x1c,0xf0,0x1c] +clflush 485498096 + +// CHECK: clflush 64(%rdx) +// CHECK: encoding: [0x0f,0xae,0x7a,0x40] +clflush 64(%rdx) + +// CHECK: clflush 64(%rdx,%rax,4) +// CHECK: encoding: [0x0f,0xae,0x7c,0x82,0x40] +clflush 64(%rdx,%rax,4) + +// CHECK: clflush -64(%rdx,%rax,4) +// CHECK: encoding: [0x0f,0xae,0x7c,0x82,0xc0] +clflush -64(%rdx,%rax,4) + +// CHECK: clflush 64(%rdx,%rax) +// CHECK: encoding: [0x0f,0xae,0x7c,0x02,0x40] +clflush 64(%rdx,%rax) + +// CHECK: clflush (%rdx) +// CHECK: encoding: [0x0f,0xae,0x3a] +clflush (%rdx) + diff --git a/test/MC/X86/x86_long_nop.s b/test/MC/X86/x86_long_nop.s index fa8525b44c22..7fa553692ea0 100644 --- a/test/MC/X86/x86_long_nop.s +++ b/test/MC/X86/x86_long_nop.s @@ -1,7 +1,7 @@ -# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s -# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s -# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s -# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=pentiumpro | llvm-objdump -d -no-show-raw-insn - | FileCheck %s +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-apple-darwin10.0 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-apple-darwin8 -mcpu=pentiumpro %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=slm %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=silvermont %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=LNOP7 %s # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=lakemont %s | llvm-objdump -d -no-show-raw-insn - | FileCheck --check-prefix=NOP1 %s diff --git a/test/TableGen/GlobalISelEmitter.td b/test/TableGen/GlobalISelEmitter.td index f9fe51f84380..bc0b509622c4 100644 --- a/test/TableGen/GlobalISelEmitter.td +++ b/test/TableGen/GlobalISelEmitter.td @@ -63,11 +63,14 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; } // CHECK-NEXT: typedef ComplexRendererFns(MyTargetInstructionSelector::*ComplexMatcherMemFn)(MachineOperand &) const; // CHECK-NEXT: const MatcherInfoTy<PredicateBitset, ComplexMatcherMemFn> MatcherInfo; // CHECK-NEXT: static MyTargetInstructionSelector::ComplexMatcherMemFn ComplexPredicateFns[]; +// CHECK-NEXT: bool testImmPredicate_I64(unsigned PredicateID, int64_t Imm) const override; +// CHECK-NEXT: bool testImmPredicate_APInt(unsigned PredicateID, const APInt &Imm) const override; +// CHECK-NEXT: bool testImmPredicate_APFloat(unsigned PredicateID, const APFloat &Imm) const override; // CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL // CHECK-LABEL: #ifdef GET_GLOBALISEL_TEMPORARIES_INIT // CHECK-NEXT: , State(2), -// CHECK-NEXT: MatcherInfo({TypeObjects, FeatureBitsets, I64ImmPredicateFns, APIntImmPredicateFns, APFloatImmPredicateFns, ComplexPredicateFns}) +// CHECK-NEXT: MatcherInfo({TypeObjects, FeatureBitsets, ComplexPredicateFns}) // CHECK-NEXT: #endif // ifdef GET_GLOBALISEL_TEMPORARIES_INIT // CHECK-LABEL: enum SubtargetFeatureBits : uint8_t { @@ -127,31 +130,49 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; } // CHECK-NEXT: enum { // CHECK-NEXT: GIPFP_I64_Predicate_simm8 = GIPFP_I64_Invalid + 1, // CHECK-NEXT: }; -// CHECK-NEXT: static bool Predicate_simm8(int64_t Imm) { return isInt<8>(Imm); } -// CHECK-NEXT: static InstructionSelector::I64ImmediatePredicateFn I64ImmPredicateFns[] = { -// CHECK-NEXT: nullptr, -// CHECK-NEXT: Predicate_simm8, -// CHECK-NEXT: }; +// CHECK-NEXT: bool MyTargetInstructionSelector::testImmPredicate_I64(unsigned PredicateID, int64_t Imm) const { +// CHECK-NEXT: switch (PredicateID) { +// CHECK-NEXT: case GIPFP_I64_Predicate_simm8: { +// CHECK-NEXT: return isInt<8>(Imm); +// CHECK-NEXT: llvm_unreachable("ImmediateCode should have returned"); +// CHECK-NEXT: return false; +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: llvm_unreachable("Unknown predicate"); +// CHECK-NEXT: return false; +// CHECK-NEXT: } // CHECK-LABEL: // PatFrag predicates. // CHECK-NEXT: enum { // CHECK-NEXT: GIPFP_APFloat_Predicate_fpimmz = GIPFP_APFloat_Invalid + 1, // CHECK-NEXT: }; -// CHECK-NEXT: static bool Predicate_fpimmz(const APFloat & Imm) { return Imm->isExactlyValue(0.0); } -// CHECK-NEXT: static InstructionSelector::APFloatImmediatePredicateFn APFloatImmPredicateFns[] = { -// CHECK-NEXT: nullptr, -// CHECK-NEXT: Predicate_fpimmz, -// CHECK-NEXT: }; +// CHECK-NEXT: bool MyTargetInstructionSelector::testImmPredicate_APFloat(unsigned PredicateID, const APFloat & Imm) const { +// CHECK-NEXT: switch (PredicateID) { +// CHECK-NEXT: case GIPFP_APFloat_Predicate_fpimmz: { +// CHECK-NEXT: return Imm->isExactlyValue(0.0); +// CHECK-NEXT: llvm_unreachable("ImmediateCode should have returned"); +// CHECK-NEXT: return false; +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: llvm_unreachable("Unknown predicate"); +// CHECK-NEXT: return false; +// CHECK-NEXT: } // CHECK-LABEL: // PatFrag predicates. // CHECK-NEXT: enum { // CHECK-NEXT: GIPFP_APInt_Predicate_simm9 = GIPFP_APInt_Invalid + 1, // CHECK-NEXT: }; -// CHECK-NEXT: static bool Predicate_simm9(const APInt & Imm) { return isInt<9>(Imm->getSExtValue()); } -// CHECK-NEXT: static InstructionSelector::APIntImmediatePredicateFn APIntImmPredicateFns[] = { -// CHECK-NEXT: nullptr, -// CHECK-NEXT: Predicate_simm9, -// CHECK-NEXT: }; +// CHECK-NEXT: bool MyTargetInstructionSelector::testImmPredicate_APInt(unsigned PredicateID, const APInt & Imm) const { +// CHECK-NEXT: switch (PredicateID) { +// CHECK-NEXT: case GIPFP_APInt_Predicate_simm9: { +// CHECK-NEXT: return isInt<9>(Imm->getSExtValue()); +// CHECK-NEXT: llvm_unreachable("ImmediateCode should have returned"); +// CHECK-NEXT: return false; +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: llvm_unreachable("Unknown predicate"); +// CHECK-NEXT: return false; +// CHECK-NEXT: } // CHECK-LABEL: MyTargetInstructionSelector::ComplexMatcherMemFn // CHECK-NEXT: MyTargetInstructionSelector::ComplexPredicateFns[] = { diff --git a/test/TableGen/intrinsic-long-name.td b/test/TableGen/intrinsic-long-name.td index 24ed89ac4acf..9bbfe9829a65 100644 --- a/test/TableGen/intrinsic-long-name.td +++ b/test/TableGen/intrinsic-long-name.td @@ -2,6 +2,7 @@ // XFAIL: vg_leak class IntrinsicProperty; +class SDNodeProperty; class ValueType<int size, int value> { string Namespace = "MVT"; @@ -20,6 +21,7 @@ class Intrinsic<string name, list<LLVMType> param_types = []> { list<LLVMType> RetTypes = []; list<LLVMType> ParamTypes = param_types; list<IntrinsicProperty> IntrProperties = []; + list<SDNodeProperty> Properties = []; } def iAny : ValueType<0, 253>; diff --git a/test/TableGen/intrinsic-struct.td b/test/TableGen/intrinsic-struct.td index 93737b14db2a..1f1a8c2c8220 100644 --- a/test/TableGen/intrinsic-struct.td +++ b/test/TableGen/intrinsic-struct.td @@ -2,6 +2,7 @@ // XFAIL: vg_leak class IntrinsicProperty; +class SDNodeProperty; class ValueType<int size, int value> { string Namespace = "MVT"; @@ -20,6 +21,7 @@ class Intrinsic<string name, list<LLVMType> ret_types = []> { list<LLVMType> RetTypes = ret_types; list<LLVMType> ParamTypes = []; list<IntrinsicProperty> IntrProperties = []; + list<SDNodeProperty> Properties = []; } def iAny : ValueType<0, 253>; diff --git a/test/TableGen/intrinsic-varargs.td b/test/TableGen/intrinsic-varargs.td index 4b2cc5ae02f7..484364779012 100644 --- a/test/TableGen/intrinsic-varargs.td +++ b/test/TableGen/intrinsic-varargs.td @@ -2,6 +2,7 @@ // XFAIL: vg_leak class IntrinsicProperty; +class SDNodeProperty; class ValueType<int size, int value> { string Namespace = "MVT"; @@ -20,6 +21,7 @@ class Intrinsic<string name, list<LLVMType> param_types = []> { list<LLVMType> RetTypes = []; list<LLVMType> ParamTypes = param_types; list<IntrinsicProperty> IntrProperties = []; + list<SDNodeProperty> Properties = []; } // isVoid needs to match the definition in ValueTypes.td diff --git a/test/ThinLTO/X86/cache.ll b/test/ThinLTO/X86/cache.ll index ea5c2f98d876..75466442d786 100644 --- a/test/ThinLTO/X86/cache.ll +++ b/test/ThinLTO/X86/cache.ll @@ -5,7 +5,7 @@ ; Verify that enabling caching is ignoring module without hash ; RUN: rm -Rf %t.cache && mkdir %t.cache -; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache +; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache ; RUN: ls %t.cache/llvmcache.timestamp ; RUN: ls %t.cache | count 1 @@ -27,7 +27,7 @@ ; files matching the pattern "llvmcache-*". ; RUN: rm -Rf %t.cache && mkdir %t.cache ; RUN: touch -t 197001011200 %t.cache/llvmcache-foo %t.cache/foo -; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache +; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache ; RUN: ls %t.cache | count 4 ; RUN: ls %t.cache/llvmcache.timestamp ; RUN: ls %t.cache/foo @@ -36,13 +36,29 @@ ; Verify that enabling caching is working with llvm-lto2 ; RUN: rm -Rf %t.cache -; RUN: llvm-lto2 run -o %t.o %t2.bc %t.bc -cache-dir %t.cache \ +; RUN: llvm-lto2 run -o %t.o %t2.bc %t.bc -cache-dir %t.cache \ ; RUN: -r=%t2.bc,_main,plx \ ; RUN: -r=%t2.bc,_globalfunc,lx \ ; RUN: -r=%t.bc,_globalfunc,plx ; RUN: ls %t.cache | count 2 ; RUN: ls %t.cache/llvmcache-* | count 2 +; Verify that caches with a timestamp older than the pruning interval +; will be pruned +; RUN: rm -Rf %t.cache && mkdir %t.cache +; RUN: touch -t 197001011200 %t.cache/llvmcache-foo +; RUN: touch -t 197001011200 %t.cache/llvmcache.timestamp +; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache +; RUN: not ls %t.cache/llvmcache-foo + +; Verify that specifying a negative number for the pruning interval +; effectively disables the pruning +; RUN: rm -Rf %t.cache && mkdir %t.cache +; RUN: touch -t 197001011200 %t.cache/llvmcache-foo +; RUN: touch -t 197001011200 %t.cache/llvmcache.timestamp +; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc %t.bc -thinlto-cache-dir %t.cache --thinlto-cache-pruning-interval -1 +; RUN: ls %t.cache/llvmcache-foo + target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.11.0" diff --git a/test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll b/test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll new file mode 100644 index 000000000000..e10b04a850af --- /dev/null +++ b/test/Transforms/CallSiteSplitting/callsite-no-or-structure.ll @@ -0,0 +1,139 @@ +; RUN: opt < %s -callsite-splitting -S | FileCheck %s +; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s + +; CHECK-LABEL: @test_simple +; CHECK-LABEL: Header: +; CHECK-NEXT: br i1 undef, label %Tail.predBB1.split +; CHECK-LABEL: TBB: +; CHECK: br i1 %cmp, label %Tail.predBB2.split +; CHECK-LABEL: Tail.predBB1.split: +; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* %a, i32 %v, i32 %p) +; CHECK-LABEL: Tail.predBB2.split: +; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 %p) +; CHECK-LABEL: Tail +; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +; CHECK: ret i32 %[[MERGED]] +define i32 @test_simple(i32* %a, i32 %v, i32 %p) { +Header: + br i1 undef, label %Tail, label %End + +TBB: + %cmp = icmp eq i32* %a, null + br i1 %cmp, label %Tail, label %End + +Tail: + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +; CHECK-LABEL: @test_eq_eq_eq_untaken +; CHECK-LABEL: Header: +; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB1.split +; CHECK-LABEL: TBB2: +; CHECK: br i1 %cmp2, label %Tail.predBB2.split, label %End +; CHECK-LABEL: Tail.predBB1.split: +; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p) +; CHECK-LABEL: Tail.predBB2.split: +; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 1, i32 99) +; CHECK-LABEL: Tail +; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +; CHECK: ret i32 %[[MERGED]] +define i32 @test_eq_eq_eq_untaken2(i32* %a, i32 %v, i32 %p) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %TBB1, label %Tail + +TBB1: + %cmp1 = icmp eq i32 %v, 1 + br i1 %cmp1, label %TBB2, label %End + +TBB2: + %cmp2 = icmp eq i32 %p, 99 + br i1 %cmp2, label %Tail, label %End + +Tail: + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +; CHECK-LABEL: @test_eq_ne_eq_untaken +; CHECK-LABEL: Header: +; CHECK: br i1 %tobool1, label %TBB1, label %Tail.predBB1.split +; CHECK-LABEL: TBB2: +; CHECK: br i1 %cmp2, label %Tail.predBB2.split, label %End +; CHECK-LABEL: Tail.predBB1.split: +; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 %p) +; CHECK-LABEL: Tail.predBB2.split: +; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* null, i32 %v, i32 99) +; CHECK-LABEL: Tail +; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +; CHECK: ret i32 %[[MERGED]] +define i32 @test_eq_ne_eq_untaken(i32* %a, i32 %v, i32 %p) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %TBB1, label %Tail + +TBB1: + %cmp1 = icmp ne i32 %v, 1 + br i1 %cmp1, label %TBB2, label %End + +TBB2: + %cmp2 = icmp eq i32 %p, 99 + br i1 %cmp2, label %Tail, label %End + +Tail: + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +; CHECK-LABEL: @test_header_header2_tbb +; CHECK: Header2: +; CHECK:br i1 %tobool2, label %Tail.predBB1.split, label %TBB1 +; CHECK-LABEL: TBB2: +; CHECK: br i1 %cmp2, label %Tail.predBB2.split, label %End +; CHECK-LABEL: Tail.predBB1.split: +; CHECK: %[[CALL1:.*]] = call i32 @callee(i32* nonnull %a, i32 %v, i32 10) +; CHECK-LABEL: Tail.predBB2.split: +; NOTE: CallSiteSplitting cannot infer that %a is null here, as it currently +; only supports recording conditions along a single predecessor path. +; CHECK: %[[CALL2:.*]] = call i32 @callee(i32* %a, i32 1, i32 99) +; CHECK-LABEL: Tail +; CHECK: %[[MERGED:.*]] = phi i32 [ %[[CALL1]], %Tail.predBB1.split ], [ %[[CALL2]], %Tail.predBB2.split ] +; CHECK: ret i32 %[[MERGED]] +define i32 @test_header_header2_tbb(i32* %a, i32 %v, i32 %p) { +Header: + %tobool1 = icmp eq i32* %a, null + br i1 %tobool1, label %TBB1, label %Header2 + +Header2: + %tobool2 = icmp eq i32 %p, 10 + br i1 %tobool2, label %Tail, label %TBB1 + +TBB1: + %cmp1 = icmp eq i32 %v, 1 + br i1 %cmp1, label %TBB2, label %End + +TBB2: + %cmp2 = icmp eq i32 %p, 99 + br i1 %cmp2, label %Tail, label %End + +Tail: + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r + +End: + ret i32 %v +} + +define i32 @callee(i32* %a, i32 %v, i32 %p) { + ret i32 10 +} diff --git a/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll b/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll new file mode 100644 index 000000000000..ca41bd6fc5e1 --- /dev/null +++ b/test/Transforms/CallSiteSplitting/callsite-no-splitting.ll @@ -0,0 +1,18 @@ +; RUN: opt < %s -callsite-splitting -S | FileCheck %s +; RUN: opt < %s -passes='function(callsite-splitting)' -S | FileCheck %s + +define i32 @callee(i32*, i32, i32) { + ret i32 10 +} + +; CHECK-LABEL: @test_preds_equal +; CHECK-NOT: split +; CHECK: br i1 %cmp, label %Tail, label %Tail +define i32 @test_preds_equal(i32* %a, i32 %v, i32 %p) { +TBB: + %cmp = icmp eq i32* %a, null + br i1 %cmp, label %Tail, label %Tail +Tail: + %r = call i32 @callee(i32* %a, i32 %v, i32 %p) + ret i32 %r +} diff --git a/test/Transforms/CodeGenPrepare/section.ll b/test/Transforms/CodeGenPrepare/section.ll index 4f3144e7fc73..30598ba7afbe 100644 --- a/test/Transforms/CodeGenPrepare/section.ll +++ b/test/Transforms/CodeGenPrepare/section.ll @@ -4,33 +4,59 @@ target triple = "x86_64-pc-linux-gnu" ; This tests that hot/cold functions get correct section prefix assigned -; CHECK: hot_func{{.*}}!section_prefix ![[HOT_ID:[0-9]+]] +; CHECK: hot_func1{{.*}}!section_prefix ![[HOT_ID:[0-9]+]] ; The entry is hot -define void @hot_func() !prof !15 { +define void @hot_func1() !prof !15 { ret void } -; For instrumentation based PGO, we should only look at entry counts, +; CHECK: hot_func2{{.*}}!section_prefix ![[HOT_ID:[0-9]+]] +; Entry is cold but inner block is hot +define void @hot_func2(i32 %n) !prof !16 { +entry: + %n.addr = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %n, i32* %n.addr, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %n.addr, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.end, !prof !19 + +for.body: + %2 = load i32, i32* %i, align 4 + %inc = add nsw i32 %2, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: + ret void +} + +; For instrumentation based PGO, we should only look at block counts, ; not call site VP metadata (which can exist on value profiled memcpy, ; or possibly left behind after static analysis based devirtualization). ; CHECK: cold_func1{{.*}}!section_prefix ![[COLD_ID:[0-9]+]] define void @cold_func1() !prof !16 { - call void @hot_func(), !prof !17 - call void @hot_func(), !prof !17 + call void @hot_func1(), !prof !17 + call void @hot_func1(), !prof !17 ret void } -; CHECK: cold_func2{{.*}}!section_prefix +; CHECK: cold_func2{{.*}}!section_prefix ![[COLD_ID]] define void @cold_func2() !prof !16 { - call void @hot_func(), !prof !17 - call void @hot_func(), !prof !18 - call void @hot_func(), !prof !18 + call void @hot_func1(), !prof !17 + call void @hot_func1(), !prof !18 + call void @hot_func1(), !prof !18 ret void } ; CHECK: cold_func3{{.*}}!section_prefix ![[COLD_ID]] define void @cold_func3() !prof !16 { - call void @hot_func(), !prof !18 + call void @hot_func1(), !prof !18 ret void } @@ -55,3 +81,4 @@ define void @cold_func3() !prof !16 { !16 = !{!"function_entry_count", i64 1} !17 = !{!"branch_weights", i32 80} !18 = !{!"branch_weights", i32 1} +!19 = !{!"branch_weights", i32 1000, i32 1} diff --git a/test/Transforms/GVN/tbaa.ll b/test/Transforms/GVN/tbaa.ll index 7c05fda6cb8f..5cb4e0359970 100644 --- a/test/Transforms/GVN/tbaa.ll +++ b/test/Transforms/GVN/tbaa.ll @@ -1,7 +1,7 @@ ; RUN: opt -tbaa -basicaa -gvn -S < %s | FileCheck %s define i32 @test1(i8* %p, i8* %q) { -; CHECK: @test1(i8* %p, i8* %q) +; CHECK-LABEL: @test1(i8* %p, i8* %q) ; CHECK: call i32 @foo(i8* %p) ; CHECK-NOT: tbaa ; CHECK: %c = add i32 %a, %a @@ -12,7 +12,7 @@ define i32 @test1(i8* %p, i8* %q) { } define i32 @test2(i8* %p, i8* %q) { -; CHECK: @test2(i8* %p, i8* %q) +; CHECK-LABEL: @test2(i8* %p, i8* %q) ; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGC:!.*]] ; CHECK: %c = add i32 %a, %a %a = call i32 @foo(i8* %p), !tbaa !0 @@ -22,7 +22,7 @@ define i32 @test2(i8* %p, i8* %q) { } define i32 @test3(i8* %p, i8* %q) { -; CHECK: @test3(i8* %p, i8* %q) +; CHECK-LABEL: @test3(i8* %p, i8* %q) ; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGB:!.*]] ; CHECK: %c = add i32 %a, %a %a = call i32 @foo(i8* %p), !tbaa !3 @@ -32,7 +32,7 @@ define i32 @test3(i8* %p, i8* %q) { } define i32 @test4(i8* %p, i8* %q) { -; CHECK: @test4(i8* %p, i8* %q) +; CHECK-LABEL: @test4(i8* %p, i8* %q) ; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]] ; CHECK: %c = add i32 %a, %a %a = call i32 @foo(i8* %p), !tbaa !1 @@ -42,8 +42,8 @@ define i32 @test4(i8* %p, i8* %q) { } define i32 @test5(i8* %p, i8* %q) { -; CHECK: @test5(i8* %p, i8* %q) -; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]] +; CHECK-LABEL: @test5(i8* %p, i8* %q) +; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]] ; CHECK: %c = add i32 %a, %a %a = call i32 @foo(i8* %p), !tbaa !0 %b = call i32 @foo(i8* %p), !tbaa !1 @@ -52,8 +52,8 @@ define i32 @test5(i8* %p, i8* %q) { } define i32 @test6(i8* %p, i8* %q) { -; CHECK: @test6(i8* %p, i8* %q) -; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]] +; CHECK-LABEL: @test6(i8* %p, i8* %q) +; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]] ; CHECK: %c = add i32 %a, %a %a = call i32 @foo(i8* %p), !tbaa !0 %b = call i32 @foo(i8* %p), !tbaa !3 @@ -62,7 +62,7 @@ define i32 @test6(i8* %p, i8* %q) { } define i32 @test7(i8* %p, i8* %q) { -; CHECK: @test7(i8* %p, i8* %q) +; CHECK-LABEL: @test7(i8* %p, i8* %q) ; CHECK: call i32 @foo(i8* %p) ; CHECK-NOT: tbaa ; CHECK: %c = add i32 %a, %a @@ -72,10 +72,8 @@ define i32 @test7(i8* %p, i8* %q) { ret i32 %c } - - define i32 @test8(i32* %p, i32* %q) { -; CHECK-LABEL: test8 +; CHECK-LABEL: @test8 ; CHECK-NEXT: store i32 15, i32* %p ; CHECK-NEXT: ret i32 0 ; Since we know the location is invariant, we can forward the @@ -87,8 +85,9 @@ define i32 @test8(i32* %p, i32* %q) { %c = sub i32 %a, %b ret i32 %c } + define i32 @test9(i32* %p, i32* %q) { -; CHECK-LABEL: test9 +; CHECK-LABEL: @test9 ; CHECK-NEXT: call void @clobber() ; CHECK-NEXT: ret i32 0 ; Since we know the location is invariant, we can forward the @@ -101,16 +100,27 @@ define i32 @test9(i32* %p, i32* %q) { ret i32 %c } +define i32 @test10(i8* %p, i8* %q) { +; If one access encloses the other, then the merged access is the enclosed one +; and not just the common final access type. +; CHECK-LABEL: @test10 +; CHECK: call i32 @foo(i8* %p), !tbaa [[TAG_X_i:!.*]] +; CHECK: %c = add i32 %a, %a + %a = call i32 @foo(i8* %p), !tbaa !15 ; TAG_X_i + %b = call i32 @foo(i8* %p), !tbaa !19 ; TAG_Y_x_i + %c = add i32 %a, %b + ret i32 %c +} declare void @clobber() declare i32 @foo(i8*) readonly -; CHECK: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0} -; CHECK: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]} -; CHECK: [[TYPEA]] = !{!"A", !{{.*}}} -; CHECK: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0} -; CHECK: [[TYPEB]] = !{!"B", [[TYPEA]]} -; CHECK: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0} +; CHECK-DAG: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0} +; CHECK-DAG: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]} +; CHECK-DAG: [[TYPEA]] = !{!"A", !{{.*}}} +; CHECK-DAG: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0} +; CHECK-DAG: [[TYPEB]] = !{!"B", [[TYPEA]]} +; CHECK-DAG: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0} !0 = !{!5, !5, i64 0} !1 = !{!6, !6, i64 0} !2 = !{!"tbaa root"} @@ -122,8 +132,17 @@ declare i32 @foo(i8*) readonly !8 = !{!"another root"} !11 = !{!"scalar type", !8} +; CHECK-DAG: [[TAG_X_i]] = !{[[TYPE_X:!.*]], [[TYPE_int:!.*]], i64 0} +; CHECK-DAG: [[TYPE_X:!.*]] = !{!"struct X", [[TYPE_int]], i64 0} +; CHECK-DAG: [[TYPE_int]] = !{!"int", {{!.*}}, i64 0} +!15 = !{!16, !17, i64 0} ; TAG_X_i +!16 = !{!"struct X", !17, i64 0} ; struct X { int i; }; +!17 = !{!"int", !18, i64 0} +!18 = !{!"char", !2, i64 0} -;; A TBAA structure who's only point is to have a constant location +!19 = !{!20, !17, i64 0} ; TAG_Y_x_i +!20 = !{!"struct Y", !16, i64 0} ; struct Y { struct X x; }; + +; A TBAA structure who's only point is to have a constant location. !9 = !{!"yet another root"} !10 = !{!"node", !9, i64 1} - diff --git a/test/Transforms/Inline/AArch64/binop.ll b/test/Transforms/Inline/AArch64/binop.ll new file mode 100644 index 000000000000..051528991e46 --- /dev/null +++ b/test/Transforms/Inline/AArch64/binop.ll @@ -0,0 +1,291 @@ +; RUN: opt -inline -mtriple=aarch64--linux-gnu -S -o - < %s -inline-threshold=0 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +declare void @pad() +@glbl = external global i32 + +define i32 @outer_add1(i32 %a) { +; CHECK-LABEL: @outer_add1( +; CHECK-NOT: call i32 @add + %C = call i32 @add(i32 %a, i32 0) + ret i32 %C +} + +define i32 @outer_add2(i32 %a) { +; CHECK-LABEL: @outer_add2( +; CHECK-NOT: call i32 @add + %C = call i32 @add(i32 0, i32 %a) + ret i32 %C +} + +define i32 @add(i32 %a, i32 %b) { + %add = add i32 %a, %b + call void @pad() + store i32 0, i32* @glbl + ret i32 %add +} + + + +define i32 @outer_sub1(i32 %a) { +; CHECK-LABEL: @outer_sub1( +; CHECK-NOT: call i32 @sub1 + %C = call i32 @sub1(i32 %a, i32 0) + ret i32 %C +} + +define i32 @sub1(i32 %a, i32 %b) { + %sub = sub i32 %a, %b + call void @pad() + store i32 0, i32* @glbl + ret i32 %sub +} + + +define i32 @outer_sub2(i32 %a) { +; CHECK-LABEL: @outer_sub2( +; CHECK-NOT: call i32 @sub2 + %C = call i32 @sub2(i32 %a) + ret i32 %C +} + +define i32 @sub2(i32 %a) { + %sub = sub i32 %a, %a + call void @pad() + ret i32 %sub +} + + + +define i32 @outer_mul1(i32 %a) { +; CHECK-LABEL: @outer_mul1( +; CHECK-NOT: call i32 @mul + %C = call i32 @mul(i32 %a, i32 0) + ret i32 %C +} + +define i32 @outer_mul2(i32 %a) { +; CHECK-LABEL: @outer_mul2( +; CHECK-NOT: call i32 @mul + %C = call i32 @mul(i32 %a, i32 1) + ret i32 %C +} + +define i32 @mul(i32 %a, i32 %b) { + %mul = mul i32 %a, %b + call void @pad() + store i32 0, i32* @glbl + ret i32 %mul +} + + + +define i32 @outer_div1(i32 %a) { +; CHECK-LABEL: @outer_div1( +; CHECK-NOT: call i32 @div1 + %C = call i32 @div1(i32 0, i32 %a) + ret i32 %C +} + +define i32 @outer_div2(i32 %a) { +; CHECK-LABEL: @outer_div2( +; CHECK-NOT: call i32 @div1 + %C = call i32 @div1(i32 %a, i32 1) + ret i32 %C +} + +define i32 @div1(i32 %a, i32 %b) { + %div = sdiv i32 %a, %b + call void @pad() + store i32 0, i32* @glbl + ret i32 %div +} + + +define i32 @outer_div3(i32 %a) { +; CHECK-LABEL: @outer_div3( +; CHECK-NOT: call i32 @div + %C = call i32 @div2(i32 %a) + ret i32 %C +} + +define i32 @div2(i32 %a) { + %div = sdiv i32 %a, %a + call void @pad() + ret i32 %div +} + + + +define i32 @outer_rem1(i32 %a) { +; CHECK-LABEL: @outer_rem1( +; CHECK-NOT: call i32 @rem + %C = call i32 @rem1(i32 0, i32 %a) + ret i32 %C +} + +define i32 @outer_rem2(i32 %a) { +; CHECK-LABEL: @outer_rem2( +; CHECK-NOT: call i32 @rem + %C = call i32 @rem1(i32 %a, i32 1) + ret i32 %C +} + +define i32 @rem1(i32 %a, i32 %b) { + %rem = urem i32 %a, %b + call void @pad() + store i32 0, i32* @glbl + ret i32 %rem +} + + +define i32 @outer_rem3(i32 %a) { +; CHECK-LABEL: @outer_rem3( +; CHECK-NOT: call i32 @rem + %C = call i32 @rem2(i32 %a) + ret i32 %C +} + +define i32 @rem2(i32 %a) { + %rem = urem i32 %a, %a + call void @pad() + ret i32 %rem +} + + + +define i32 @outer_shl1(i32 %a) { +; CHECK-LABEL: @outer_shl1( +; CHECK-NOT: call i32 @shl + %C = call i32 @shl(i32 %a, i32 0) + ret i32 %C +} + +define i32 @shl(i32 %a, i32 %b) { + %shl = shl i32 %a, %b + call void @pad() + store i32 0, i32* @glbl + ret i32 %shl +} + + + +define i32 @outer_shr1(i32 %a) { +; CHECK-LABEL: @outer_shr1( +; CHECK-NOT: call i32 @shr + %C = call i32 @shr(i32 %a, i32 0) + ret i32 %C +} + +define i32 @shr(i32 %a, i32 %b) { + %shr = ashr i32 %a, %b + call void @pad() + store i32 0, i32* @glbl + ret i32 %shr +} + + + +define i1 @outer_and1(i1 %a) { +; check-label: @outer_and1( +; check-not: call i1 @and1 + %c = call i1 @and1(i1 %a, i1 false) + ret i1 %c +} + +define i1 @outer_and2(i1 %a) { +; check-label: @outer_and2( +; check-not: call i1 @and1 + %c = call i1 @and1(i1 %a, i1 true) + ret i1 %c +} + +define i1 @and1(i1 %a, i1 %b) { + %and = and i1 %a, %b + call void @pad() + store i32 0, i32* @glbl + ret i1 %and +} + + +define i1 @outer_and3(i1 %a) { +; check-label: @outer_and3( +; check-not: call i1 @and2 + %c = call i1 @and2(i1 %a) + ret i1 %c +} + +define i1 @and2(i1 %a) { + %and = and i1 %a, %a + call void @pad() + ret i1 %and +} + + + +define i1 @outer_or1(i1 %a) { +; check-label: @outer_or1( +; check-not: call i1 @or1 + %c = call i1 @or1(i1 %a, i1 false) + ret i1 %c +} + +define i1 @outer_or2(i1 %a) { +; check-label: @outer_or2( +; check-not: call i1 @or1 + %c = call i1 @or1(i1 %a, i1 true) + ret i1 %c +} + +define i1 @or1(i1 %a, i1 %b) { + %or = or i1 %a, %b + call void @pad() + store i32 0, i32* @glbl + ret i1 %or +} + + +define i1 @outer_or3(i1 %a) { +; check-label: @outer_or3( +; check-not: call i1 @or2 + %c = call i1 @or2(i1 %a) + ret i1 %c +} + +define i1 @or2(i1 %a) { + %or = or i1 %a, %a + call void @pad() + ret i1 %or +} + + + +define i1 @outer_xor1(i1 %a) { +; check-label: @outer_xor1( +; check-not: call i1 @xor + %c = call i1 @xor1(i1 %a, i1 false) + ret i1 %c +} + +define i1 @xor1(i1 %a, i1 %b) { + %xor = xor i1 %a, %b + call void @pad() + store i32 0, i32* @glbl + ret i1 %xor +} + + +define i1 @outer_xor3(i1 %a) { +; check-label: @outer_xor3( +; check-not: call i1 @xor + %c = call i1 @xor2(i1 %a) + ret i1 %c +} + +define i1 @xor2(i1 %a) { + %xor = xor i1 %a, %a + call void @pad() + ret i1 %xor +} diff --git a/test/Transforms/Inline/ARM/inline-fp.ll b/test/Transforms/Inline/ARM/inline-fp.ll new file mode 100644 index 000000000000..b4e76dfc7d2d --- /dev/null +++ b/test/Transforms/Inline/ARM/inline-fp.ll @@ -0,0 +1,113 @@ +; RUN: opt -S -inline -mtriple=arm-eabi -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 | FileCheck %s -check-prefix=NOFP +; RUN: opt -S -inline -mtriple=arm-eabi -mattr=+vfp2 -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 | FileCheck %s -check-prefix=FULLFP +; RUN: opt -S -inline -mtriple=arm-eabi -mattr=+vfp2,+fp-only-sp -pass-remarks=.* -pass-remarks-missed=.* < %s 2>&1 | FileCheck %s -check-prefix=SINGLEFP +; Make sure that soft float implementations are calculated as being more expensive +; to the inliner. + +; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75) +; NOFP-DAG: single not inlined into test_single because too costly to inline (cost=125, threshold=75) +; NOFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75) +; NOFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75) +; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75) +; NOFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75) +; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75) +; NOFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75) + +; FULLFP-DAG: single inlined into test_single with cost=0 (threshold=75) +; FULLFP-DAG: single inlined into test_single with cost=-15000 (threshold=75) +; FULLFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75) +; FULLFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75) +; FULLFP-DAG: double inlined into test_double with cost=0 (threshold=75) +; FULLFP-DAG: double inlined into test_double with cost=-15000 (threshold=75) +; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75) +; FULLFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75) + +; SINGLEFP-DAG: single inlined into test_single with cost=0 (threshold=75) +; SINGLEFP-DAG: single inlined into test_single with cost=-15000 (threshold=75) +; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with cost=-15 (threshold=75) +; SINGLEFP-DAG: single_cheap inlined into test_single_cheap with cost=-15015 (threshold=75) +; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75) +; SINGLEFP-DAG: double not inlined into test_double because too costly to inline (cost=125, threshold=75) +; SINGLEFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75) +; SINGLEFP-DAG: single_force_soft not inlined into test_single_force_soft because too costly to inline (cost=125, threshold=75) + +define i32 @test_single(i32 %a, i8 %b, i32 %c, i8 %d) #0 { + %call = call float @single(i32 %a, i8 zeroext %b) + %call2 = call float @single(i32 %c, i8 zeroext %d) + ret i32 0 +} + +define i32 @test_single_cheap(i32 %a, i8 %b, i32 %c, i8 %d) #0 { + %call = call float @single_cheap(i32 %a, i8 zeroext %b) + %call2 = call float @single_cheap(i32 %c, i8 zeroext %d) + ret i32 0 +} + +define i32 @test_double(i32 %a, i8 %b, i32 %c, i8 %d) #0 { + %call = call double @double(i32 %a, i8 zeroext %b) + %call2 = call double @double(i32 %c, i8 zeroext %d) + ret i32 0 +} + +define i32 @test_single_force_soft(i32 %a, i8 %b, i32 %c, i8 %d) #1 { + %call = call float @single_force_soft(i32 %a, i8 zeroext %b) #1 + %call2 = call float @single_force_soft(i32 %c, i8 zeroext %d) #1 + ret i32 0 +} + +define internal float @single(i32 %response, i8 zeroext %value1) #0 { +entry: + %conv = zext i8 %value1 to i32 + %sub = add nsw i32 %conv, -1 + %conv1 = sitofp i32 %sub to float + %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1) + %mul = fmul float %0, 2.620000e+03 + %conv2 = sitofp i32 %response to float + %sub3 = fsub float %conv2, %mul + %div = fdiv float %sub3, %mul + ret float %div +} + +define internal float @single_cheap(i32 %response, i8 zeroext %value1) #0 { +entry: + %conv = zext i8 %value1 to i32 + %sub = add nsw i32 %conv, -1 + %conv1 = bitcast i32 %sub to float + %conv2 = bitcast i32 %response to float + %0 = tail call float @llvm.pow.f32(float %conv2, float %conv1) + %1 = tail call float @llvm.pow.f32(float %0, float %0) + %2 = tail call float @llvm.pow.f32(float %1, float %1) + ret float %2 +} + +define internal double @double(i32 %response, i8 zeroext %value1) #0 { +entry: + %conv = zext i8 %value1 to i32 + %sub = add nsw i32 %conv, -1 + %conv1 = sitofp i32 %sub to double + %0 = tail call double @llvm.pow.f64(double 0x3FF028F5C0000000, double %conv1) + %mul = fmul double %0, 2.620000e+03 + %conv2 = sitofp i32 %response to double + %sub3 = fsub double %conv2, %mul + %div = fdiv double %sub3, %mul + ret double %div +} + +define internal float @single_force_soft(i32 %response, i8 zeroext %value1) #1 { +entry: + %conv = zext i8 %value1 to i32 + %sub = add nsw i32 %conv, -1 + %conv1 = sitofp i32 %sub to float + %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1) + %mul = fmul float %0, 2.620000e+03 + %conv2 = sitofp i32 %response to float + %sub3 = fsub float %conv2, %mul + %div = fdiv float %sub3, %mul + ret float %div +} + +declare float @llvm.pow.f32(float, float) optsize minsize +declare double @llvm.pow.f64(double, double) optsize minsize + +attributes #0 = { optsize } +attributes #1 = { optsize "use-soft-float"="true" "target-features"="+soft-float" } diff --git a/test/Transforms/Inline/inline-fp.ll b/test/Transforms/Inline/inline-fp.ll deleted file mode 100644 index dd5972fe1b8a..000000000000 --- a/test/Transforms/Inline/inline-fp.ll +++ /dev/null @@ -1,137 +0,0 @@ -; RUN: opt -S -inline < %s | FileCheck %s -; RUN: opt -S -passes='cgscc(inline)' < %s | FileCheck %s -; Make sure that soft float implementations are calculated as being more expensive -; to the inliner. - -define i32 @test_nofp() #0 { -; f_nofp() has the "use-soft-float" attribute, so it should never get inlined. -; CHECK-LABEL: test_nofp -; CHECK: call float @f_nofp -entry: - %responseX = alloca i32, align 4 - %responseY = alloca i32, align 4 - %responseZ = alloca i32, align 4 - %valueX = alloca i8, align 1 - %valueY = alloca i8, align 1 - %valueZ = alloca i8, align 1 - - call void @getX(i32* %responseX, i8* %valueX) - call void @getY(i32* %responseY, i8* %valueY) - call void @getZ(i32* %responseZ, i8* %valueZ) - - %0 = load i32, i32* %responseX - %1 = load i8, i8* %valueX - %call = call float @f_nofp(i32 %0, i8 zeroext %1) - %2 = load i32, i32* %responseZ - %3 = load i8, i8* %valueZ - %call2 = call float @f_nofp(i32 %2, i8 zeroext %3) - %call3 = call float @fabsf(float %call) - %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000 - br i1 %cmp, label %if.end12, label %if.else - -if.else: ; preds = %entry - %4 = load i32, i32* %responseY - %5 = load i8, i8* %valueY - %call1 = call float @f_nofp(i32 %4, i8 zeroext %5) - %call4 = call float @fabsf(float %call1) - %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000 - br i1 %cmp5, label %if.end12, label %if.else7 - -if.else7: ; preds = %if.else - %call8 = call float @fabsf(float %call2) - %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000 - br i1 %cmp9, label %if.then10, label %if.end12 - -if.then10: ; preds = %if.else7 - br label %if.end12 - -if.end12: ; preds = %if.else, %entry, %if.then10, %if.else7 - %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ] - ret i32 %success.0 -} - -define i32 @test_hasfp() #0 { -; f_hasfp() does not have the "use-soft-float" attribute, so it should get inlined. -; CHECK-LABEL: test_hasfp -; CHECK-NOT: call float @f_hasfp -entry: - %responseX = alloca i32, align 4 - %responseY = alloca i32, align 4 - %responseZ = alloca i32, align 4 - %valueX = alloca i8, align 1 - %valueY = alloca i8, align 1 - %valueZ = alloca i8, align 1 - - call void @getX(i32* %responseX, i8* %valueX) - call void @getY(i32* %responseY, i8* %valueY) - call void @getZ(i32* %responseZ, i8* %valueZ) - - %0 = load i32, i32* %responseX - %1 = load i8, i8* %valueX - %call = call float @f_hasfp(i32 %0, i8 zeroext %1) - %2 = load i32, i32* %responseZ - %3 = load i8, i8* %valueZ - %call2 = call float @f_hasfp(i32 %2, i8 zeroext %3) - %call3 = call float @fabsf(float %call) - %cmp = fcmp ogt float %call3, 0x3FC1EB8520000000 - br i1 %cmp, label %if.end12, label %if.else - -if.else: ; preds = %entry - %4 = load i32, i32* %responseY - %5 = load i8, i8* %valueY - %call1 = call float @f_hasfp(i32 %4, i8 zeroext %5) - %call4 = call float @fabsf(float %call1) - %cmp5 = fcmp ogt float %call4, 0x3FC1EB8520000000 - br i1 %cmp5, label %if.end12, label %if.else7 - -if.else7: ; preds = %if.else - %call8 = call float @fabsf(float %call2) - %cmp9 = fcmp ogt float %call8, 0x3FC1EB8520000000 - br i1 %cmp9, label %if.then10, label %if.end12 - -if.then10: ; preds = %if.else7 - br label %if.end12 - -if.end12: ; preds = %if.else, %entry, %if.then10, %if.else7 - %success.0 = phi i32 [ 0, %if.then10 ], [ 1, %if.else7 ], [ 0, %entry ], [ 0, %if.else ] - ret i32 %success.0 -} - -declare void @getX(i32*, i8*) #0 - -declare void @getY(i32*, i8*) #0 - -declare void @getZ(i32*, i8*) #0 - -define internal float @f_hasfp(i32 %response, i8 zeroext %value1) #0 { -entry: - %conv = zext i8 %value1 to i32 - %sub = add nsw i32 %conv, -1 - %conv1 = sitofp i32 %sub to float - %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1) - %mul = fmul float %0, 2.620000e+03 - %conv2 = sitofp i32 %response to float - %sub3 = fsub float %conv2, %mul - %div = fdiv float %sub3, %mul - ret float %div -} - -define internal float @f_nofp(i32 %response, i8 zeroext %value1) #1 { -entry: - %conv = zext i8 %value1 to i32 - %sub = add nsw i32 %conv, -1 - %conv1 = sitofp i32 %sub to float - %0 = tail call float @llvm.pow.f32(float 0x3FF028F5C0000000, float %conv1) - %mul = fmul float %0, 2.620000e+03 - %conv2 = sitofp i32 %response to float - %sub3 = fsub float %conv2, %mul - %div = fdiv float %sub3, %mul - ret float %div -} - -declare float @fabsf(float) optsize minsize - -declare float @llvm.pow.f32(float, float) optsize minsize - -attributes #0 = { optsize } -attributes #1 = { optsize "use-soft-float"="true" } diff --git a/test/Transforms/Inline/redundant-loads.ll b/test/Transforms/Inline/redundant-loads.ll index 6b89f1db484b..176f605fc73b 100644 --- a/test/Transforms/Inline/redundant-loads.ll +++ b/test/Transforms/Inline/redundant-loads.ll @@ -184,3 +184,21 @@ define void @inner9(i32* %a, void ()* %f) { call void @pad() ret void } + + +define void @outer10(i32* %a) { +; CHECK-LABEL: @outer10( +; CHECK: call void @inner10 + %b = alloca i32 + call void @inner10(i32* %a, i32* %b) + ret void +} + +define void @inner10(i32* %a, i32* %b) { + %1 = load i32, i32* %a + store i32 %1, i32 * %b + %2 = load volatile i32, i32* %a ; volatile load should be kept. + call void @pad() + %3 = load volatile i32, i32* %a ; Same as the above. + ret void +} diff --git a/test/Transforms/InstCombine/2011-09-03-Trampoline.ll b/test/Transforms/InstCombine/2011-09-03-Trampoline.ll index 1833558cbceb..7a315094a04e 100644 --- a/test/Transforms/InstCombine/2011-09-03-Trampoline.ll +++ b/test/Transforms/InstCombine/2011-09-03-Trampoline.ll @@ -5,18 +5,18 @@ declare i8* @llvm.adjust.trampoline(i8*) declare i32 @f(i8 * nest, i32) ; Most common case -define i32 @test0(i32 %n) { +define i32 @test0(i32 %n) !dbg !4 { %alloca = alloca [10 x i8], align 16 %gep = getelementptr [10 x i8], [10 x i8]* %alloca, i32 0, i32 0 call void @llvm.init.trampoline(i8* %gep, i8* bitcast (i32 (i8*, i32)* @f to i8*), i8* null) %tramp = call i8* @llvm.adjust.trampoline(i8* %gep) %function = bitcast i8* %tramp to i32(i32)* - %ret = call i32 %function(i32 %n) + %ret = call i32 %function(i32 %n), !dbg !10 ret i32 %ret -; CHECK: define i32 @test0(i32 %n) { -; CHECK: %ret = call i32 @f(i8* nest null, i32 %n) +; CHECK: define i32 @test0(i32 %n) !dbg !4 { +; CHECK: %ret = call i32 @f(i8* nest null, i32 %n), !dbg !10 } define i32 @test1(i32 %n, i8* %trampmem) { @@ -85,3 +85,18 @@ define i32 @test4(i32 %n) { ; CHECK: %ret1 = call i32 @f(i8* nest null, i32 %n) ; CHECK: %ret2 = call i32 @f(i8* nest null, i32 %n) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.0 (trunk 127710)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2) +!1 = !DIFile(filename: "string.h", directory: "Game") +!2 = !{} +!3 = !{i32 1, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "passthru", scope: !1, file: !1, line: 79, type: !5, isLocal: true, isDefinition: true, scopeLine: 79, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !8) +!5 = !DISubroutineType(types: !6) +!6 = !{!7} +!7 = !DIDerivedType(tag: DW_TAG_pointer_type, scope: !0, baseType: null, size: 64, align: 64) +!8 = !{!9} +!9 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 78, type: !7) +!10 = !DILocation(line: 78, column: 28, scope: !4) diff --git a/test/Transforms/JumpThreading/guards.ll b/test/Transforms/JumpThreading/guards.ll index 53175a7b7253..c760283f9e52 100644 --- a/test/Transforms/JumpThreading/guards.ll +++ b/test/Transforms/JumpThreading/guards.ll @@ -278,3 +278,106 @@ L2: L3: ret void } + +; Make sure that we don't PRE a non-speculable load across a guard. +define void @unsafe_pre_across_guard(i8* %p, i1 %load.is.valid) { + +; CHECK-LABEL: @unsafe_pre_across_guard( +; CHECK-NOT: loaded.pr +; CHECK: entry: +; CHECK-NEXT: br label %loop +; CHECK: loop: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ] +; CHECK-NEXT: %loaded = load i8, i8* %p +; CHECK-NEXT: %continue = icmp eq i8 %loaded, 0 +; CHECK-NEXT: br i1 %continue, label %exit, label %loop +entry: + br label %loop + +loop: ; preds = %loop, %entry + call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ] + %loaded = load i8, i8* %p + %continue = icmp eq i8 %loaded, 0 + br i1 %continue, label %exit, label %loop + +exit: ; preds = %loop + ret void +} + +; Make sure that we can safely PRE a speculable load across a guard. +define void @safe_pre_across_guard(i8* noalias nocapture readonly dereferenceable(8) %p, i1 %load.is.valid) { + +; CHECK-LABEL: @safe_pre_across_guard( +; CHECK: entry: +; CHECK-NEXT: %loaded.pr = load i8, i8* %p +; CHECK-NEXT: br label %loop +; CHECK: loop: +; CHECK-NEXT: %loaded = phi i8 [ %loaded, %loop ], [ %loaded.pr, %entry ] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ] +; CHECK-NEXT: %continue = icmp eq i8 %loaded, 0 +; CHECK-NEXT: br i1 %continue, label %exit, label %loop + +entry: + br label %loop + +loop: ; preds = %loop, %entry + call void (i1, ...) @llvm.experimental.guard(i1 %load.is.valid) [ "deopt"() ] + %loaded = load i8, i8* %p + %continue = icmp eq i8 %loaded, 0 + br i1 %continue, label %exit, label %loop + +exit: ; preds = %loop + ret void +} + +; Make sure that we don't PRE a non-speculable load across a call which may +; alias with the load. +define void @unsafe_pre_across_call(i8* %p) { + +; CHECK-LABEL: @unsafe_pre_across_call( +; CHECK-NOT: loaded.pr +; CHECK: entry: +; CHECK-NEXT: br label %loop +; CHECK: loop: +; CHECK-NEXT: call i32 @f1() +; CHECK-NEXT: %loaded = load i8, i8* %p +; CHECK-NEXT: %continue = icmp eq i8 %loaded, 0 +; CHECK-NEXT: br i1 %continue, label %exit, label %loop +entry: + br label %loop + +loop: ; preds = %loop, %entry + call i32 @f1() + %loaded = load i8, i8* %p + %continue = icmp eq i8 %loaded, 0 + br i1 %continue, label %exit, label %loop + +exit: ; preds = %loop + ret void +} + +; Make sure that we can safely PRE a speculable load across a call. +define void @safe_pre_across_call(i8* noalias nocapture readonly dereferenceable(8) %p) { + +; CHECK-LABEL: @safe_pre_across_call( +; CHECK: entry: +; CHECK-NEXT: %loaded.pr = load i8, i8* %p +; CHECK-NEXT: br label %loop +; CHECK: loop: +; CHECK-NEXT: %loaded = phi i8 [ %loaded, %loop ], [ %loaded.pr, %entry ] +; CHECK-NEXT: call i32 @f1() +; CHECK-NEXT: %continue = icmp eq i8 %loaded, 0 +; CHECK-NEXT: br i1 %continue, label %exit, label %loop + +entry: + br label %loop + +loop: ; preds = %loop, %entry + call i32 @f1() + %loaded = load i8, i8* %p + %continue = icmp eq i8 %loaded, 0 + br i1 %continue, label %exit, label %loop + +exit: ; preds = %loop + ret void +} diff --git a/test/Transforms/LoopVectorize/legal_preheader_check.ll b/test/Transforms/LoopVectorize/legal_preheader_check.ll new file mode 100644 index 000000000000..32aa796394d6 --- /dev/null +++ b/test/Transforms/LoopVectorize/legal_preheader_check.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -loop-vectorize -debug -S -o /dev/null 2>&1 | FileCheck %s +; REQUIRES: asserts + +; D40973 +; Make sure LV legal bails out when the loop doesn't have a legal pre-header. + +; CHECK: LV: Loop doesn't have a legal pre-header. + +define void @inc(i32 %n, i8* %P) { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %BB1, label %BB2 + +BB1: + indirectbr i8* %P, [label %.lr.ph] + +BB2: + br label %.lr.ph + +.lr.ph: + %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %BB1 ], [ 0, %BB2 ] + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: + ret void +} diff --git a/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll b/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll new file mode 100644 index 000000000000..e3d1f6dd2b17 --- /dev/null +++ b/test/Transforms/MemCpyOpt/memcpy-invoke-memcpy.ll @@ -0,0 +1,48 @@ +; RUN: opt < %s -memcpyopt -S | FileCheck %s +; Test memcpy-memcpy dependencies across invoke edges. + +; Test that memcpyopt works across the non-unwind edge of an invoke. + +define hidden void @test_normal(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %temp = alloca i8, i32 64 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false) + invoke void @invoke_me() + to label %try.cont unwind label %lpad + +lpad: + landingpad { i8*, i32 } + catch i8* null + ret void + +try.cont: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false) + ret void +} + +; Test that memcpyopt works across the unwind edge of an invoke. + +define hidden void @test_unwind(i8* noalias %dst, i8* %src) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +entry: + %temp = alloca i8, i32 64 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false) + invoke void @invoke_me() + to label %try.cont unwind label %lpad + +lpad: + landingpad { i8*, i32 } + catch i8* null + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false) + ret void + +try.cont: + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) +declare i32 @__gxx_personality_v0(...) +declare void @invoke_me() readnone diff --git a/test/Transforms/MemCpyOpt/merge-into-memset.ll b/test/Transforms/MemCpyOpt/merge-into-memset.ll new file mode 100644 index 000000000000..fc31038a4e6d --- /dev/null +++ b/test/Transforms/MemCpyOpt/merge-into-memset.ll @@ -0,0 +1,45 @@ +; RUN: opt < %s -memcpyopt -S | FileCheck %s +; Update cached non-local dependence information when merging stores into memset. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Don't delete the memcpy in %if.then, even though it depends on an instruction +; which will be deleted. + +; CHECK-LABEL: @foo +define void @foo(i1 %c, i8* %d, i8* %e, i8* %f) { +entry: + %tmp = alloca [50 x i8], align 8 + %tmp4 = bitcast [50 x i8]* %tmp to i8* + %tmp1 = getelementptr inbounds i8, i8* %tmp4, i64 1 + call void @llvm.memset.p0i8.i64(i8* nonnull %d, i8 0, i64 10, i32 1, i1 false), !dbg !5 + store i8 0, i8* %tmp4, align 8, !dbg !5 +; CHECK: call void @llvm.memset.p0i8.i64(i8* nonnull %d, i8 0, i64 10, i32 1, i1 false), !dbg !5 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %tmp1, i8* nonnull %d, i64 10, i32 1, i1 false) + br i1 %c, label %if.then, label %exit + +if.then: +; CHECK: if.then: +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %f, i8* nonnull %tmp4, i64 30, i32 8, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %f, i8* nonnull %tmp4, i64 30, i32 8, i1 false) + br label %exit + +exit: + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1) +declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_Rust, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "t.rs", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !DILocation(line: 8, column: 5, scope: !6) +!6 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 5, type: !7, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!7 = !DISubroutineType(types: !8) +!8 = !{null} diff --git a/test/Transforms/MemCpyOpt/mixed-sizes.ll b/test/Transforms/MemCpyOpt/mixed-sizes.ll new file mode 100644 index 000000000000..9091fe7f56c0 --- /dev/null +++ b/test/Transforms/MemCpyOpt/mixed-sizes.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -memcpyopt -S | FileCheck %s +; Handle memcpy-memcpy dependencies of differing sizes correctly. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Don't delete the second memcpy, even though there's an earlier +; memcpy with a larger size from the same address. + +; CHECK-LABEL: @foo +define i32 @foo(i1 %z) { +entry: + %a = alloca [10 x i32] + %s = alloca [10 x i32] + %0 = bitcast [10 x i32]* %a to i8* + %1 = bitcast [10 x i32]* %s to i8* + call void @llvm.memset.p0i8.i64(i8* nonnull %1, i8 0, i64 40, i32 16, i1 false) + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %a, i64 0, i64 0 + store i32 1, i32* %arrayidx + %scevgep = getelementptr [10 x i32], [10 x i32]* %s, i64 0, i64 1 + %scevgep7 = bitcast i32* %scevgep to i8* + br i1 %z, label %for.body3.lr.ph, label %for.inc7.1 + +for.body3.lr.ph: ; preds = %entry + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 17179869180, i32 4, i1 false) + br label %for.inc7.1 + +for.inc7.1: +; CHECK: for.inc7.1: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 4, i32 4, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %scevgep7, i64 4, i32 4, i1 false) + %2 = load i32, i32* %arrayidx + ret i32 %2 +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1) +declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1) diff --git a/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll b/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll new file mode 100644 index 000000000000..5b0510211d9f --- /dev/null +++ b/test/Transforms/MemCpyOpt/nonlocal-memcpy-memcpy.ll @@ -0,0 +1,114 @@ +; RUN: opt < %s -memcpyopt -S | FileCheck %s +; Make sure memcpy-memcpy dependence is optimized across +; basic blocks (conditional branches and invokes). + +%struct.s = type { i32, i32 } + +@s_foo = private unnamed_addr constant %struct.s { i32 1, i32 2 }, align 4 +@s_baz = private unnamed_addr constant %struct.s { i32 1, i32 2 }, align 4 +@i = external constant i8* + +declare void @qux() +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) +declare void @__cxa_throw(i8*, i8*, i8*) +declare i32 @__gxx_personality_v0(...) +declare i8* @__cxa_begin_catch(i8*) + +; A simple partial redundancy. Test that the second memcpy is optimized +; to copy directly from the original source rather than from the temporary. + +; CHECK-LABEL: @wobble +define void @wobble(i8* noalias %dst, i8* %src, i1 %some_condition) { +bb: + %temp = alloca i8, i32 64 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %temp, i8* nonnull %src, i64 64, i32 8, i1 false) + br i1 %some_condition, label %more, label %out + +out: + call void @qux() + unreachable + +more: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %temp, i64 64, i32 8, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 64, i32 8, i1 false) + ret void +} + +; A CFG triangle with a partial redundancy targeting an alloca. Test that the +; memcpy inside the triangle is optimized to copy directly from the original +; source rather than from the temporary. + +; CHECK-LABEL: @foo +define i32 @foo(i1 %t3) { +bb: + %s = alloca %struct.s, align 4 + %t = alloca %struct.s, align 4 + %s1 = bitcast %struct.s* %s to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s1, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s1, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false) + br i1 %t3, label %bb4, label %bb7 + +bb4: ; preds = %bb + %t5 = bitcast %struct.s* %t to i8* + %s6 = bitcast %struct.s* %s to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t5, i8* %s6, i64 8, i32 4, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t5, i8* bitcast (%struct.s* @s_foo to i8*), i64 8, i32 4, i1 false) + br label %bb7 + +bb7: ; preds = %bb4, %bb + %t8 = getelementptr %struct.s, %struct.s* %t, i32 0, i32 0 + %t9 = load i32, i32* %t8, align 4 + %t10 = getelementptr %struct.s, %struct.s* %t, i32 0, i32 1 + %t11 = load i32, i32* %t10, align 4 + %t12 = add i32 %t9, %t11 + ret i32 %t12 +} + +; A CFG diamond with an invoke on one side, and a partially redundant memcpy +; into an alloca on the other. Test that the memcpy inside the diamond is +; optimized to copy ; directly from the original source rather than from the +; temporary. This more complex test represents a relatively common usage +; pattern. + +; CHECK-LABEL: @baz +define i32 @baz(i1 %t5) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +bb: + %s = alloca %struct.s, align 4 + %t = alloca %struct.s, align 4 + %s3 = bitcast %struct.s* %s to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s3, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %s3, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false) + br i1 %t5, label %bb6, label %bb22 + +bb6: ; preds = %bb + invoke void @__cxa_throw(i8* null, i8* bitcast (i8** @i to i8*), i8* null) + to label %bb25 unwind label %bb9 + +bb9: ; preds = %bb6 + %t10 = landingpad { i8*, i32 } + catch i8* null + br label %bb13 + +bb13: ; preds = %bb9 + %t15 = call i8* @__cxa_begin_catch(i8* null) + br label %bb23 + +bb22: ; preds = %bb + %t23 = bitcast %struct.s* %t to i8* + %s24 = bitcast %struct.s* %s to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t23, i8* %s24, i64 8, i32 4, i1 false) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %t23, i8* bitcast (%struct.s* @s_baz to i8*), i64 8, i32 4, i1 false) + br label %bb23 + +bb23: ; preds = %bb22, %bb13 + %t17 = getelementptr inbounds %struct.s, %struct.s* %t, i32 0, i32 0 + %t18 = load i32, i32* %t17, align 4 + %t19 = getelementptr inbounds %struct.s, %struct.s* %t, i32 0, i32 1 + %t20 = load i32, i32* %t19, align 4 + %t21 = add nsw i32 %t18, %t20 + ret i32 %t21 + +bb25: ; preds = %bb6 + unreachable +} diff --git a/test/Transforms/NewGVN/tbaa.ll b/test/Transforms/NewGVN/tbaa.ll index 3dcc4f8acc14..d48ededac03a 100644 --- a/test/Transforms/NewGVN/tbaa.ll +++ b/test/Transforms/NewGVN/tbaa.ll @@ -1,7 +1,7 @@ ; RUN: opt -tbaa -basicaa -newgvn -S < %s | FileCheck %s define i32 @test1(i8* %p, i8* %q) { -; CHECK: @test1(i8* %p, i8* %q) +; CHECK-LABEL: @test1(i8* %p, i8* %q) ; CHECK: call i32 @foo(i8* %p) ; CHECK-NOT: tbaa ; CHECK: %c = add i32 %a, %a @@ -12,7 +12,7 @@ define i32 @test1(i8* %p, i8* %q) { } define i32 @test2(i8* %p, i8* %q) { -; CHECK: @test2(i8* %p, i8* %q) +; CHECK-LABEL: @test2(i8* %p, i8* %q) ; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGC:!.*]] ; CHECK: %c = add i32 %a, %a %a = call i32 @foo(i8* %p), !tbaa !0 @@ -22,7 +22,7 @@ define i32 @test2(i8* %p, i8* %q) { } define i32 @test3(i8* %p, i8* %q) { -; CHECK: @test3(i8* %p, i8* %q) +; CHECK-LABEL: @test3(i8* %p, i8* %q) ; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGB:!.*]] ; CHECK: %c = add i32 %a, %a %a = call i32 @foo(i8* %p), !tbaa !3 @@ -32,7 +32,7 @@ define i32 @test3(i8* %p, i8* %q) { } define i32 @test4(i8* %p, i8* %q) { -; CHECK: @test4(i8* %p, i8* %q) +; CHECK-LABEL: @test4(i8* %p, i8* %q) ; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]] ; CHECK: %c = add i32 %a, %a %a = call i32 @foo(i8* %p), !tbaa !1 @@ -42,8 +42,8 @@ define i32 @test4(i8* %p, i8* %q) { } define i32 @test5(i8* %p, i8* %q) { -; CHECK: @test5(i8* %p, i8* %q) -; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]] +; CHECK-LABEL: @test5(i8* %p, i8* %q) +; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]] ; CHECK: %c = add i32 %a, %a %a = call i32 @foo(i8* %p), !tbaa !0 %b = call i32 @foo(i8* %p), !tbaa !1 @@ -52,8 +52,8 @@ define i32 @test5(i8* %p, i8* %q) { } define i32 @test6(i8* %p, i8* %q) { -; CHECK: @test6(i8* %p, i8* %q) -; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA:!.*]] +; CHECK-LABEL: @test6(i8* %p, i8* %q) +; CHECK: call i32 @foo(i8* %p), !tbaa [[TAGA]] ; CHECK: %c = add i32 %a, %a %a = call i32 @foo(i8* %p), !tbaa !0 %b = call i32 @foo(i8* %p), !tbaa !3 @@ -62,7 +62,7 @@ define i32 @test6(i8* %p, i8* %q) { } define i32 @test7(i8* %p, i8* %q) { -; CHECK: @test7(i8* %p, i8* %q) +; CHECK-LABEL: @test7(i8* %p, i8* %q) ; CHECK: call i32 @foo(i8* %p) ; CHECK-NOT: tbaa ; CHECK: %c = add i32 %a, %a @@ -72,10 +72,8 @@ define i32 @test7(i8* %p, i8* %q) { ret i32 %c } - - define i32 @test8(i32* %p, i32* %q) { -; CHECK-LABEL: test8 +; CHECK-LABEL: @test8 ; CHECK-NEXT: store i32 15, i32* %p ; CHECK-NEXT: ret i32 0 ; Since we know the location is invariant, we can forward the @@ -87,8 +85,9 @@ define i32 @test8(i32* %p, i32* %q) { %c = sub i32 %a, %b ret i32 %c } + define i32 @test9(i32* %p, i32* %q) { -; CHECK-LABEL: test9 +; CHECK-LABEL: @test9 ; CHECK-NEXT: call void @clobber() ; CHECK-NEXT: ret i32 0 ; Since we know the location is invariant, we can forward the @@ -101,16 +100,27 @@ define i32 @test9(i32* %p, i32* %q) { ret i32 %c } +define i32 @test10(i8* %p, i8* %q) { +; If one access encloses the other, then the merged access is the enclosed one +; and not just the common final access type. +; CHECK-LABEL: @test10 +; CHECK: call i32 @foo(i8* %p), !tbaa [[TAG_X_i:!.*]] +; CHECK: %c = add i32 %a, %a + %a = call i32 @foo(i8* %p), !tbaa !15 ; TAG_X_i + %b = call i32 @foo(i8* %p), !tbaa !19 ; TAG_Y_x_i + %c = add i32 %a, %b + ret i32 %c +} declare void @clobber() declare i32 @foo(i8*) readonly -; CHECK: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0} -; CHECK: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]} -; CHECK: [[TYPEA]] = !{!"A", !{{.*}}} -; CHECK: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0} -; CHECK: [[TYPEB]] = !{!"B", [[TYPEA]]} -; CHECK: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0} +; CHECK-DAG: [[TAGC]] = !{[[TYPEC:!.*]], [[TYPEC]], i64 0} +; CHECK-DAG: [[TYPEC]] = !{!"C", [[TYPEA:!.*]]} +; CHECK-DAG: [[TYPEA]] = !{!"A", !{{.*}}} +; CHECK-DAG: [[TAGB]] = !{[[TYPEB:!.*]], [[TYPEB]], i64 0} +; CHECK-DAG: [[TYPEB]] = !{!"B", [[TYPEA]]} +; CHECK-DAG: [[TAGA]] = !{[[TYPEA]], [[TYPEA]], i64 0} !0 = !{!5, !5, i64 0} !1 = !{!6, !6, i64 0} !2 = !{!"tbaa root"} @@ -122,8 +132,17 @@ declare i32 @foo(i8*) readonly !8 = !{!"another root"} !11 = !{!"scalar type", !8} +; CHECK-DAG: [[TAG_X_i]] = !{[[TYPE_X:!.*]], [[TYPE_int:!.*]], i64 0} +; CHECK-DAG: [[TYPE_X:!.*]] = !{!"struct X", [[TYPE_int]], i64 0} +; CHECK-DAG: [[TYPE_int]] = !{!"int", {{!.*}}, i64 0} +!15 = !{!16, !17, i64 0} ; TAG_X_i +!16 = !{!"struct X", !17, i64 0} ; struct X { int i; }; +!17 = !{!"int", !18, i64 0} +!18 = !{!"char", !2, i64 0} -;; A TBAA structure who's only point is to have a constant location +!19 = !{!20, !17, i64 0} ; TAG_Y_x_i +!20 = !{!"struct Y", !16, i64 0} ; struct Y { struct X x; }; + +; A TBAA structure who's only point is to have a constant location. !9 = !{!"yet another root"} !10 = !{!"node", !9, i64 1} - diff --git a/test/Transforms/PGOProfile/icp_covariant_call_return.ll b/test/Transforms/PGOProfile/icp_covariant_call_return.ll index fc5054e3a574..aba075461deb 100644 --- a/test/Transforms/PGOProfile/icp_covariant_call_return.ll +++ b/test/Transforms/PGOProfile/icp_covariant_call_return.ll @@ -22,8 +22,7 @@ entry: %vtable = load %struct.Base* (%struct.B*)**, %struct.Base* (%struct.B*)*** %tmp2, align 8 %vfn = getelementptr inbounds %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vtable, i64 0 %tmp3 = load %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vfn, align 8 -; ICALL-PROM: [[BITCAST:%[0-9]+]] = bitcast %struct.Base* (%struct.B*)* %tmp3 to i8* -; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to i8*) +; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq %struct.Base* (%struct.B*)* %tmp3, bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to %struct.Base* (%struct.B*)*) ; ICALL-PROM: br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]] ; ICALL-PROM:if.true.direct_targ: ; ICALL-PROM: [[ARG_BITCAST:%[0-9]+]] = bitcast %struct.B* %tmp1 to %struct.D* diff --git a/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll b/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll index d2ff47dda0e6..0a4444783eb0 100644 --- a/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll +++ b/test/Transforms/PGOProfile/icp_covariant_invoke_return.ll @@ -32,18 +32,19 @@ invoke.cont: %vtable = load %struct.Base* (%struct.B*)**, %struct.Base* (%struct.B*)*** %tmp2, align 8 %vfn = getelementptr inbounds %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vtable, i64 0 %tmp3 = load %struct.Base* (%struct.B*)*, %struct.Base* (%struct.B*)** %vfn, align 8 -; ICALL-PROM: [[BITCAST:%[0-9]+]] = bitcast %struct.Base* (%struct.B*)* %tmp3 to i8* -; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to i8*) +; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq %struct.Base* (%struct.B*)* %tmp3, bitcast (%struct.Derived* (%struct.D*)* @_ZN1D4funcEv to %struct.Base* (%struct.B*)*) ; ICALL-PROM: br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]] ; ICALL-PROM:if.true.direct_targ: ; ICALL-PROM: [[ARG_BITCAST:%[0-9]+]] = bitcast %struct.B* %tmp1 to %struct.D* ; ICALL-PROM: [[DIRCALL_RET:%[0-9]+]] = invoke %struct.Derived* @_ZN1D4funcEv(%struct.D* [[ARG_BITCAST]]) -; ICALL-PROM: to label %if.end.icp unwind label %lpad +; ICALL-PROM: to label %if.true.direct_targ.if.end.icp_crit_edge unwind label %lpad +; ICALL-PROM:if.true.direct_targ.if.end.icp_crit_edge: +; ICALL-PROM: [[DIRCALL_RET_CAST:%[0-9]+]] = bitcast %struct.Derived* [[DIRCALL_RET]] to %struct.Base* +; ICALL-PROM: br label %if.end.icp ; ICALL-PROM:if.false.orig_indirect: ; ICAll-PROM: %call2 = invoke %struct.Base* %tmp3(%struct.B* %tmp1) ; ICAll-PROM: to label %invoke.cont1 unwind label %lpad ; ICALL-PROM:if.end.icp: -; ICALL-PROM: [[DIRCALL_RET_CAST:%[0-9]+]] = bitcast %struct.Derived* [[DIRCALL_RET]] to %struct.Base* ; ICALL-PROM: br label %invoke.cont1 %call2 = invoke %struct.Base* %tmp3(%struct.B* %tmp1) to label %invoke.cont1 unwind label %lpad, !prof !1 diff --git a/test/Transforms/PGOProfile/icp_invoke.ll b/test/Transforms/PGOProfile/icp_invoke.ll index 2ec564627aa1..1cacc1bc1aca 100644 --- a/test/Transforms/PGOProfile/icp_invoke.ll +++ b/test/Transforms/PGOProfile/icp_invoke.ll @@ -20,8 +20,7 @@ entry: define i32 @_Z3goov() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { entry: %tmp = load void ()*, void ()** @foo1, align 8 -; ICP: [[BITCAST_IC1:%[0-9]+]] = bitcast void ()* %tmp to i8* -; ICP: [[CMP_IC1:%[0-9]+]] = icmp eq i8* [[BITCAST_IC1]], bitcast (void ()* @_ZL4bar1v to i8*) +; ICP: [[CMP_IC1:%[0-9]+]] = icmp eq void ()* %tmp, @_ZL4bar1v ; ICP: br i1 [[CMP_IC1]], label %[[TRUE_LABEL_IC1:.*]], label %[[FALSE_LABEL_IC1:.*]], !prof [[BRANCH_WEIGHT:![0-9]+]] ; ICP:[[TRUE_LABEL_IC1]]: ; ICP: invoke void @_ZL4bar1v() @@ -49,17 +48,19 @@ catch: try.cont: %tmp6 = load i32 ()*, i32 ()** @foo2, align 8 -; ICP: [[BITCAST_IC2:%[0-9]+]] = bitcast i32 ()* %tmp6 to i8* -; ICP: [[CMP_IC2:%[0-9]+]] = icmp eq i8* [[BITCAST_IC2]], bitcast (i32 ()* @_ZL4bar2v to i8*) +; ICP: [[CMP_IC2:%[0-9]+]] = icmp eq i32 ()* %tmp6, @_ZL4bar2v ; ICP: br i1 [[CMP_IC2]], label %[[TRUE_LABEL_IC2:.*]], label %[[FALSE_LABEL_IC2:.*]], !prof [[BRANCH_WEIGHT:![0-9]+]] ; ICP:[[TRUE_LABEL_IC2]]: -; ICP: [[RESULT_IC2:%[0-9]+]] = invoke i32 @_ZL4bar2v() -; ICP: to label %[[DCALL_NORMAL_DEST_IC2:.*]] unwind label %lpad1 +; ICP: [[RESULT_IC2_0:%[0-9]+]] = invoke i32 @_ZL4bar2v() +; ICP: to label %[[MERGE_BB:.*]] unwind label %lpad1 ; ICP:[[FALSE_LABEL_IC2]]: +; ICP: [[RESULT_IC2_1:%.+]] = invoke i32 %tmp6() +; ICP: to label %[[MERGE_BB]] unwind label %lpad1 %call = invoke i32 %tmp6() to label %try.cont8 unwind label %lpad1, !prof !3 -; ICP:[[DCALL_NORMAL_DEST_IC2]]: +; ICP:[[MERGE_BB]]: +; ICP: [[MERGE_PHI:%.+]] = phi i32 [ [[RESULT_IC2_1]], %[[FALSE_LABEL_IC2]] ], [ [[RESULT_IC2_0]], %[[TRUE_LABEL_IC2]] ] ; ICP: br label %try.cont8 lpad1: %tmp7 = landingpad { i8*, i32 } @@ -77,7 +78,7 @@ catch6: try.cont8: %i.0 = phi i32 [ undef, %catch6 ], [ %call, %try.cont ] -; ICP: %i.0 = phi i32 [ undef, %catch6 ], [ %call, %[[FALSE_LABEL_IC2]] ], [ [[RESULT_IC2]], %[[DCALL_NORMAL_DEST_IC2]] ] +; ICP: %i.0 = phi i32 [ undef, %catch6 ], [ [[MERGE_PHI]], %[[MERGE_BB]] ] ret i32 %i.0 eh.resume: diff --git a/test/Transforms/PGOProfile/icp_invoke_nouse.ll b/test/Transforms/PGOProfile/icp_invoke_nouse.ll index 5a1e6358cb61..096d2e0f222e 100644 --- a/test/Transforms/PGOProfile/icp_invoke_nouse.ll +++ b/test/Transforms/PGOProfile/icp_invoke_nouse.ll @@ -18,8 +18,7 @@ entry: if.end: ; preds = %entry %fptr = load i32 ()*, i32 ()** @pfptr, align 8 -; ICP: [[BITCAST_IC1:%[0-9]+]] = bitcast i32 ()* %fptr to i8* -; ICP: [[CMP_IC1:%[0-9]+]] = icmp eq i8* [[BITCAST_IC1]], bitcast (i32 ()* @_ZL4bar1v to i8*) +; ICP: [[CMP_IC1:%[0-9]+]] = icmp eq i32 ()* %fptr, @_ZL4bar1v ; ICP: br i1 [[CMP_IC1]], label %[[TRUE_LABEL_IC1:.*]], label %[[FALSE_LABEL_IC1:.*]], !prof [[BRANCH_WEIGHT:![0-9]+]] ; ICP:[[TRUE_LABEL_IC1]]: ; ICP: invoke i32 @_ZL4bar1v() diff --git a/test/Transforms/PGOProfile/icp_vararg.ll b/test/Transforms/PGOProfile/icp_vararg.ll index 400aab3aead7..ec243470290a 100644 --- a/test/Transforms/PGOProfile/icp_vararg.ll +++ b/test/Transforms/PGOProfile/icp_vararg.ll @@ -13,8 +13,7 @@ entry: define i32 @bar() #1 { entry: %tmp = load i32 (i32, ...)*, i32 (i32, ...)** @foo, align 8 -; ICALL-PROM: [[BITCAST:%[0-9]+]] = bitcast i32 (i32, ...)* %tmp to i8* -; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (i32 (i32, ...)* @va_func to i8*) +; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i32 (i32, ...)* %tmp, @va_func ; ICALL-PROM: br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]] ; ICALL-PROM:if.true.direct_targ: ; ICALL-PROM: [[DIRCALL_RET:%[0-9]+]] = call i32 (i32, ...) @va_func(i32 3, i32 12, i32 22, i32 4) diff --git a/test/Transforms/PGOProfile/indirect_call_promotion.ll b/test/Transforms/PGOProfile/indirect_call_promotion.ll index 6832fecfaed3..85df5260f199 100644 --- a/test/Transforms/PGOProfile/indirect_call_promotion.ll +++ b/test/Transforms/PGOProfile/indirect_call_promotion.ll @@ -43,8 +43,7 @@ entry: define i32 @bar() { entry: %tmp = load i32 ()*, i32 ()** @foo, align 8 -; ICALL-PROM: [[BITCAST:%[0-9]+]] = bitcast i32 ()* %tmp to i8* -; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i8* [[BITCAST]], bitcast (i32 ()* @func4 to i8*) +; ICALL-PROM: [[CMP:%[0-9]+]] = icmp eq i32 ()* %tmp, @func4 ; ICALL-PROM: br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]] ; ICALL-PROM: if.true.direct_targ: ; ICALL-PROM: [[DIRCALL_RET:%[0-9]+]] = call i32 @func4() diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll index 4def8ce561c0..557a83a75626 100644 --- a/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll +++ b/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll @@ -11,16 +11,20 @@ define i32 @fn1() { ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0> -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 8, i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP7]], <4 x i32> <i32 6, i32 0, i32 0, i32 0> -; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 0), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 2), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 8, i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> <i32 6, i32 0, i32 0, i32 0> +; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4 ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll deleted file mode 100644 index 5fc0298b6cef..000000000000 --- a/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll +++ /dev/null @@ -1,125 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s - - -;void jumble (int * restrict A, int * restrict B) { - ; int tmp0 = A[10]*A[0]; - ; int tmp1 = A[11]*A[1]; - ; int tmp2 = A[12]*A[3]; - ; int tmp3 = A[13]*A[2]; - ; B[0] = tmp0; - ; B[1] = tmp1; - ; B[2] = tmp2; - ; B[3] = tmp3; - ;} - - - ; Function Attrs: norecurse nounwind uwtable - define void @jumble1(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) { -; CHECK-LABEL: @jumble1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2> -; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: ret void -; -entry: - %arrayidx = getelementptr inbounds i32, i32* %A, i64 10 - %0 = load i32, i32* %arrayidx, align 4 - %1 = load i32, i32* %A, align 4 - %mul = mul nsw i32 %0, %1 - %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 11 - %2 = load i32, i32* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1 - %3 = load i32, i32* %arrayidx3, align 4 - %mul4 = mul nsw i32 %2, %3 - %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 12 - %4 = load i32, i32* %arrayidx5, align 4 - %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 3 - %5 = load i32, i32* %arrayidx6, align 4 - %mul7 = mul nsw i32 %4, %5 - %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 13 - %6 = load i32, i32* %arrayidx8, align 4 - %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 2 - %7 = load i32, i32* %arrayidx9, align 4 - %mul10 = mul nsw i32 %6, %7 - store i32 %mul, i32* %B, align 4 - %arrayidx12 = getelementptr inbounds i32, i32* %B, i64 1 - store i32 %mul4, i32* %arrayidx12, align 4 - %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 2 - store i32 %mul7, i32* %arrayidx13, align 4 - %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 3 - store i32 %mul10, i32* %arrayidx14, align 4 - ret void - } - -;Reversing the operand of MUL - ; Function Attrs: norecurse nounwind uwtable - define void @jumble2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) { -; CHECK-LABEL: @jumble2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2> -; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP1]] -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: ret void -; -entry: - %arrayidx = getelementptr inbounds i32, i32* %A, i64 10 - %0 = load i32, i32* %arrayidx, align 4 - %1 = load i32, i32* %A, align 4 - %mul = mul nsw i32 %1, %0 - %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 11 - %2 = load i32, i32* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1 - %3 = load i32, i32* %arrayidx3, align 4 - %mul4 = mul nsw i32 %3, %2 - %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 12 - %4 = load i32, i32* %arrayidx5, align 4 - %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 3 - %5 = load i32, i32* %arrayidx6, align 4 - %mul7 = mul nsw i32 %5, %4 - %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 13 - %6 = load i32, i32* %arrayidx8, align 4 - %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 2 - %7 = load i32, i32* %arrayidx9, align 4 - %mul10 = mul nsw i32 %7, %6 - store i32 %mul, i32* %B, align 4 - %arrayidx12 = getelementptr inbounds i32, i32* %B, i64 1 - store i32 %mul4, i32* %arrayidx12, align 4 - %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 2 - store i32 %mul7, i32* %arrayidx13, align 4 - %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 3 - store i32 %mul10, i32* %arrayidx14, align 4 - ret void - } - diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll deleted file mode 100644 index 568fd9f3ac79..000000000000 --- a/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll +++ /dev/null @@ -1,225 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s - -;void phiUsingLoads(int *restrict A, int *restrict B) { -; int tmp0, tmp1, tmp2, tmp3; -; for (int i = 0; i < 100; i++) { -; if (A[0] == 0) { -; tmp0 = A[i + 0]; -; tmp1 = A[i + 1]; -; tmp2 = A[i + 2]; -; tmp3 = A[i + 3]; -; } else if (A[25] == 0) { -; tmp0 = A[i + 0]; -; tmp1 = A[i + 1]; -; tmp2 = A[i + 2]; -; tmp3 = A[i + 3]; -; } else if (A[50] == 0) { -; tmp0 = A[i + 0]; -; tmp1 = A[i + 1]; -; tmp2 = A[i + 2]; -; tmp3 = A[i + 3]; -; } else if (A[75] == 0) { -; tmp0 = A[i + 0]; -; tmp1 = A[i + 1]; -; tmp2 = A[i + 3]; -; tmp3 = A[i + 2]; -; } -; } -; B[0] = tmp0; -; B[1] = tmp1; -; B[2] = tmp2; -; B[3] = tmp3; -;} - - -; Function Attrs: norecurse nounwind uwtable -define void @phiUsingLoads(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) local_unnamed_addr #0 { -; CHECK-LABEL: @phiUsingLoads( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 25 -; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 50 -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 75 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; CHECK-NEXT: [[ARRAYIDX66:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[B]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP27:%.*]], <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i32> [ undef, [[ENTRY]] ], [ [[TMP27]], [[FOR_INC]] ] -; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>* -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: if.else: -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[CMP13:%.*]] = icmp eq i32 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[CMP13]], label [[IF_THEN14:%.*]], label [[IF_ELSE27:%.*]] -; CHECK: if.then14: -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX17]] to <4 x i32>* -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4 -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: if.else27: -; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX28]], align 4 -; CHECK-NEXT: [[CMP29:%.*]] = icmp eq i32 [[TMP14]], 0 -; CHECK-NEXT: br i1 [[CMP29]], label [[IF_THEN30:%.*]], label [[IF_ELSE43:%.*]] -; CHECK: if.then30: -; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX42:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX33]] to <4 x i32>* -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4 -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: if.else43: -; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX44]], align 4 -; CHECK-NEXT: [[CMP45:%.*]] = icmp eq i32 [[TMP20]], 0 -; CHECK-NEXT: br i1 [[CMP45]], label [[IF_THEN46:%.*]], label [[FOR_INC]] -; CHECK: if.then46: -; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX55:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP23]] -; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX49]] to <4 x i32>* -; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4 -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2> -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[TMP27]] = phi <4 x i32> [ [[TMP7]], [[IF_THEN]] ], [ [[TMP13]], [[IF_THEN14]] ], [ [[TMP19]], [[IF_THEN30]] ], [ [[TMP26]], [[IF_THEN46]] ], [ [[TMP2]], [[IF_ELSE43]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] -; -entry: - %0 = load i32, i32* %A, align 4 - %cmp1 = icmp eq i32 %0, 0 - %arrayidx12 = getelementptr inbounds i32, i32* %A, i64 25 - %arrayidx28 = getelementptr inbounds i32, i32* %A, i64 50 - %arrayidx44 = getelementptr inbounds i32, i32* %A, i64 75 - br label %for.body - -for.cond.cleanup: ; preds = %for.inc - store i32 %tmp0.1, i32* %B, align 4 - %arrayidx64 = getelementptr inbounds i32, i32* %B, i64 1 - store i32 %tmp1.1, i32* %arrayidx64, align 4 - %arrayidx65 = getelementptr inbounds i32, i32* %B, i64 2 - store i32 %tmp2.1, i32* %arrayidx65, align 4 - %arrayidx66 = getelementptr inbounds i32, i32* %B, i64 3 - store i32 %tmp3.1, i32* %arrayidx66, align 4 - ret void - -for.body: ; preds = %for.inc, %entry - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] - %tmp3.0111 = phi i32 [ undef, %entry ], [ %tmp3.1, %for.inc ] - %tmp2.0110 = phi i32 [ undef, %entry ], [ %tmp2.1, %for.inc ] - %tmp1.0109 = phi i32 [ undef, %entry ], [ %tmp1.1, %for.inc ] - %tmp0.0108 = phi i32 [ undef, %entry ], [ %tmp0.1, %for.inc ] - br i1 %cmp1, label %if.then, label %if.else - -if.then: ; preds = %for.body - %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - %1 = load i32, i32* %arrayidx2, align 4 - %2 = add nuw nsw i64 %indvars.iv, 1 - %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %2 - %3 = load i32, i32* %arrayidx5, align 4 - %4 = add nuw nsw i64 %indvars.iv, 2 - %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %4 - %5 = load i32, i32* %arrayidx8, align 4 - %6 = add nuw nsw i64 %indvars.iv, 3 - %arrayidx11 = getelementptr inbounds i32, i32* %A, i64 %6 - %7 = load i32, i32* %arrayidx11, align 4 - br label %for.inc - -if.else: ; preds = %for.body - %8 = load i32, i32* %arrayidx12, align 4 - %cmp13 = icmp eq i32 %8, 0 - br i1 %cmp13, label %if.then14, label %if.else27 - -if.then14: ; preds = %if.else - %arrayidx17 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - %9 = load i32, i32* %arrayidx17, align 4 - %10 = add nuw nsw i64 %indvars.iv, 1 - %arrayidx20 = getelementptr inbounds i32, i32* %A, i64 %10 - %11 = load i32, i32* %arrayidx20, align 4 - %12 = add nuw nsw i64 %indvars.iv, 2 - %arrayidx23 = getelementptr inbounds i32, i32* %A, i64 %12 - %13 = load i32, i32* %arrayidx23, align 4 - %14 = add nuw nsw i64 %indvars.iv, 3 - %arrayidx26 = getelementptr inbounds i32, i32* %A, i64 %14 - %15 = load i32, i32* %arrayidx26, align 4 - br label %for.inc - -if.else27: ; preds = %if.else - %16 = load i32, i32* %arrayidx28, align 4 - %cmp29 = icmp eq i32 %16, 0 - br i1 %cmp29, label %if.then30, label %if.else43 - -if.then30: ; preds = %if.else27 - %arrayidx33 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - %17 = load i32, i32* %arrayidx33, align 4 - %18 = add nuw nsw i64 %indvars.iv, 1 - %arrayidx36 = getelementptr inbounds i32, i32* %A, i64 %18 - %19 = load i32, i32* %arrayidx36, align 4 - %20 = add nuw nsw i64 %indvars.iv, 2 - %arrayidx39 = getelementptr inbounds i32, i32* %A, i64 %20 - %21 = load i32, i32* %arrayidx39, align 4 - %22 = add nuw nsw i64 %indvars.iv, 3 - %arrayidx42 = getelementptr inbounds i32, i32* %A, i64 %22 - %23 = load i32, i32* %arrayidx42, align 4 - br label %for.inc - -if.else43: ; preds = %if.else27 - %24 = load i32, i32* %arrayidx44, align 4 - %cmp45 = icmp eq i32 %24, 0 - br i1 %cmp45, label %if.then46, label %for.inc - -if.then46: ; preds = %if.else43 - %arrayidx49 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv - %25 = load i32, i32* %arrayidx49, align 4 - %26 = add nuw nsw i64 %indvars.iv, 1 - %arrayidx52 = getelementptr inbounds i32, i32* %A, i64 %26 - %27 = load i32, i32* %arrayidx52, align 4 - %28 = add nuw nsw i64 %indvars.iv, 3 - %arrayidx55 = getelementptr inbounds i32, i32* %A, i64 %28 - %29 = load i32, i32* %arrayidx55, align 4 - %30 = add nuw nsw i64 %indvars.iv, 2 - %arrayidx58 = getelementptr inbounds i32, i32* %A, i64 %30 - %31 = load i32, i32* %arrayidx58, align 4 - br label %for.inc - -for.inc: ; preds = %if.then, %if.then30, %if.else43, %if.then46, %if.then14 - %tmp0.1 = phi i32 [ %1, %if.then ], [ %9, %if.then14 ], [ %17, %if.then30 ], [ %25, %if.then46 ], [ %tmp0.0108, %if.else43 ] - %tmp1.1 = phi i32 [ %3, %if.then ], [ %11, %if.then14 ], [ %19, %if.then30 ], [ %27, %if.then46 ], [ %tmp1.0109, %if.else43 ] - %tmp2.1 = phi i32 [ %5, %if.then ], [ %13, %if.then14 ], [ %21, %if.then30 ], [ %29, %if.then46 ], [ %tmp2.0110, %if.else43 ] - %tmp3.1 = phi i32 [ %7, %if.then ], [ %15, %if.then14 ], [ %23, %if.then30 ], [ %31, %if.then46 ], [ %tmp3.0111, %if.else43 ] - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 100 - br i1 %exitcond, label %for.cond.cleanup, label %for.body -} diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll index be58521ed898..06e051a90b0d 100644 --- a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll +++ b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll @@ -5,27 +5,34 @@ define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) { ; CHECK-LABEL: @jumbled-load( -; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0 +; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* %in, i64 0 +; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 +; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1 +; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0> -; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 +; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4 +; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* %inn, i64 0 +; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 +; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 +; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2> -; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]] -; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 -; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 -; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 -; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_3]], [[LOAD_5]] +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_8]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_4]], [[LOAD_7]] +; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[LOAD_1]], [[LOAD_6]] +; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* %out, i64 0 +; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_7]], align 4 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* %out, i64 1 +; CHECK-NEXT: store i32 [[MUL_2]], i32* [[GEP_8]], align 4 +; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* %out, i64 2 +; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_9]], align 4 +; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* %out, i64 3 +; CHECK-NEXT: store i32 [[MUL_4]], i32* [[GEP_10]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 diff --git a/test/Transforms/SLPVectorizer/X86/store-jumbled.ll b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll index 6ae763520013..1b2c76384e0b 100644 --- a/test/Transforms/SLPVectorizer/X86/store-jumbled.ll +++ b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll @@ -6,26 +6,33 @@ define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) { ; CHECK-LABEL: @jumbled-load( ; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0 +; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1 +; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 +; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2> +; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4 ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 +; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 +; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 +; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2> -; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]] +; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4 +; CHECK-NEXT: [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]] +; CHECK-NEXT: [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_6]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]] +; CHECK-NEXT: [[MUL_4:%.*]] = mul i32 [[LOAD_4]], [[LOAD_8]] ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: store i32 [[MUL_1]], i32* [[GEP_9]], align 4 +; CHECK-NEXT: store i32 [[MUL_2]], i32* [[GEP_7]], align 4 +; CHECK-NEXT: store i32 [[MUL_3]], i32* [[GEP_10]], align 4 +; CHECK-NEXT: store i32 [[MUL_4]], i32* [[GEP_8]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 diff --git a/test/Transforms/SampleProfile/entry_counts.ll b/test/Transforms/SampleProfile/entry_counts.ll index 6137a6908cf5..cab7c87e0493 100644 --- a/test/Transforms/SampleProfile/entry_counts.ll +++ b/test/Transforms/SampleProfile/entry_counts.ll @@ -9,8 +9,8 @@ entry: ret void, !dbg !9 } -; This function does not have profile, check if function_entry_count is 0 -; CHECK: {{.*}} = !{!"function_entry_count", i64 0} +; This function does not have profile, check if function_entry_count is -1 +; CHECK: {{.*}} = !{!"function_entry_count", i64 -1} define void @no_profile() { entry: ret void diff --git a/test/Transforms/SimplifyCFG/X86/if-conversion.ll b/test/Transforms/SimplifyCFG/X86/if-conversion.ll new file mode 100644 index 000000000000..28702572d480 --- /dev/null +++ b/test/Transforms/SimplifyCFG/X86/if-conversion.ll @@ -0,0 +1,231 @@ +; RUN: opt < %s -simplifycfg -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -S | FileCheck %s +; Avoid if-conversion if there is a long dependence chain. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; The first several cases test FindLongDependenceChain returns true, so +; if-conversion is blocked. + +define i64 @test1(i64** %pp, i64* %p) { +entry: + %0 = load i64*, i64** %pp, align 8 + %1 = load i64, i64* %0, align 8 + %cmp = icmp slt i64 %1, 0 + %pint = ptrtoint i64* %p to i64 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: + %p1 = add i64 %pint, 8 + br label %cond.end + +cond.false: + %p2 = or i64 %pint, 16 + br label %cond.end + +cond.end: + %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false] + %ptr = inttoptr i64 %p3 to i64* + %val = load i64, i64* %ptr, align 8 + ret i64 %val + +; CHECK-NOT: select +} + +define i64 @test2(i64** %pp, i64* %p) { +entry: + %0 = load i64*, i64** %pp, align 8 + %1 = load i64, i64* %0, align 8 + %cmp = icmp slt i64 %1, 0 + %pint = ptrtoint i64* %p to i64 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: + %p1 = add i64 %pint, 8 + br label %cond.end + +cond.false: + %p2 = add i64 %pint, 16 + br label %cond.end + +cond.end: + %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false] + %ptr = inttoptr i64 %p3 to i64* + %val = load i64, i64* %ptr, align 8 + ret i64 %val + +; CHECK-LABEL: @test2 +; CHECK-NOT: select +} + +; The following cases test FindLongDependenceChain returns false, so +; if-conversion will proceed. + +; Non trivial LatencyAdjustment. +define i64 @test3(i64** %pp, i64* %p) { +entry: + %0 = load i64*, i64** %pp, align 8 + %1 = load i64, i64* %0, align 8 + %cmp = icmp slt i64 %1, 0 + %pint = ptrtoint i64* %p to i64 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: + %p1 = add i64 %pint, 8 + br label %cond.end + +cond.false: + %p2 = or i64 %pint, 16 + br label %cond.end + +cond.end: + %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false] + %p4 = add i64 %p3, %1 + %ptr = inttoptr i64 %p4 to i64* + %val = load i64, i64* %ptr, align 8 + ret i64 %val + +; CHECK-LABEL: @test3 +; CHECK: select +} + +; Short dependence chain. +define i64 @test4(i64* %pp, i64* %p) { +entry: + %0 = load i64, i64* %pp, align 8 + %cmp = icmp slt i64 %0, 0 + %pint = ptrtoint i64* %p to i64 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: + %p1 = add i64 %pint, 8 + br label %cond.end + +cond.false: + %p2 = or i64 %pint, 16 + br label %cond.end + +cond.end: + %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false] + %ptr = inttoptr i64 %p3 to i64* + %val = load i64, i64* %ptr, align 8 + ret i64 %val + +; CHECK-LABEL: @test4 +; CHECK: select +} + +; High IPC. +define i64 @test5(i64** %pp, i64* %p) { +entry: + %0 = load i64*, i64** %pp, align 8 + %1 = load i64, i64* %0, align 8 + %cmp = icmp slt i64 %1, 0 + %pint = ptrtoint i64* %p to i64 + %2 = add i64 %pint, 2 + %3 = add i64 %pint, 3 + %4 = or i64 %pint, 16 + %5 = and i64 %pint, 255 + + %6 = or i64 %2, 9 + %7 = and i64 %3, 255 + %8 = add i64 %4, 4 + %9 = add i64 %5, 5 + + %10 = add i64 %6, 2 + %11 = add i64 %7, 3 + %12 = add i64 %8, 4 + %13 = add i64 %9, 5 + + %14 = add i64 %10, 6 + %15 = add i64 %11, 7 + %16 = add i64 %12, 8 + %17 = add i64 %13, 9 + + %18 = add i64 %14, 10 + %19 = add i64 %15, 11 + %20 = add i64 %16, 12 + %21 = add i64 %17, 13 + + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: + %p1 = add i64 %pint, 8 + br label %cond.end + +cond.false: + %p2 = or i64 %pint, 16 + br label %cond.end + +cond.end: + %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false] + %ptr = inttoptr i64 %p3 to i64* + %val = load i64, i64* %ptr, align 8 + + ret i64 %val + +; CHECK-LABEL: @test5 +; CHECK: select +} + +; Large BB size. +define i64 @test6(i64** %pp, i64* %p) { +entry: + %0 = load i64*, i64** %pp, align 8 + %1 = load i64, i64* %0, align 8 + %cmp = icmp slt i64 %1, 0 + %pint = ptrtoint i64* %p to i64 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: + %p1 = add i64 %pint, 8 + br label %cond.end + +cond.false: + %p2 = or i64 %pint, 16 + br label %cond.end + +cond.end: + %p3 = phi i64 [%p1, %cond.true], [%p2, %cond.false] + %ptr = inttoptr i64 %p3 to i64* + %val = load i64, i64* %ptr, align 8 + %2 = add i64 %pint, 2 + %3 = add i64 %pint, 3 + %4 = add i64 %2, 4 + %5 = add i64 %3, 5 + %6 = add i64 %4, 6 + %7 = add i64 %5, 7 + %8 = add i64 %6, 6 + %9 = add i64 %7, 7 + %10 = add i64 %8, 6 + %11 = add i64 %9, 7 + %12 = add i64 %10, 6 + %13 = add i64 %11, 7 + %14 = add i64 %12, 6 + %15 = add i64 %13, 7 + %16 = add i64 %14, 6 + %17 = add i64 %15, 7 + %18 = add i64 %16, 6 + %19 = add i64 %17, 7 + %20 = add i64 %18, 6 + %21 = add i64 %19, 7 + %22 = add i64 %20, 6 + %23 = add i64 %21, 7 + %24 = add i64 %22, 6 + %25 = add i64 %23, 7 + %26 = add i64 %24, 6 + %27 = add i64 %25, 7 + %28 = add i64 %26, 6 + %29 = add i64 %27, 7 + %30 = add i64 %28, 6 + %31 = add i64 %29, 7 + %32 = add i64 %30, 8 + %33 = add i64 %31, 9 + %34 = add i64 %32, %33 + %35 = and i64 %34, 255 + %res = add i64 %val, %35 + + ret i64 %res + +; CHECK-LABEL: @test6 +; CHECK: select +} diff --git a/test/tools/llvm-cvtres/machine.test b/test/tools/llvm-cvtres/machine.test index cac36062f301..5ce6cbaf697d 100644 --- a/test/tools/llvm-cvtres/machine.test +++ b/test/tools/llvm-cvtres/machine.test @@ -29,46 +29,46 @@ X86: Machine: IMAGE_FILE_MACHINE_I386 (0x14C) X86-DAG: Relocations [ X86-DAG: .rsrc$01 { X86-NEXT: 0x1E8 IMAGE_REL_I386_DIR32NB $R000000 -X86-NEXT: 0x198 IMAGE_REL_I386_DIR32NB $R000018 -X86-NEXT: 0x1A8 IMAGE_REL_I386_DIR32NB $R000340 -X86-NEXT: 0x1C8 IMAGE_REL_I386_DIR32NB $R000668 -X86-NEXT: 0x1D8 IMAGE_REL_I386_DIR32NB $R000698 -X86-NEXT: 0x1F8 IMAGE_REL_I386_DIR32NB $R000708 -X86-NEXT: 0x1B8 IMAGE_REL_I386_DIR32NB $R000720 -X86-NEXT: 0x188 IMAGE_REL_I386_DIR32NB $R000750 +X86-NEXT: 0x198 IMAGE_REL_I386_DIR32NB $R000001 +X86-NEXT: 0x1A8 IMAGE_REL_I386_DIR32NB $R000002 +X86-NEXT: 0x1C8 IMAGE_REL_I386_DIR32NB $R000003 +X86-NEXT: 0x1D8 IMAGE_REL_I386_DIR32NB $R000004 +X86-NEXT: 0x1F8 IMAGE_REL_I386_DIR32NB $R000005 +X86-NEXT: 0x1B8 IMAGE_REL_I386_DIR32NB $R000006 +X86-NEXT: 0x188 IMAGE_REL_I386_DIR32NB $R000007 X64: Machine: IMAGE_FILE_MACHINE_AMD64 (0x8664) X64-DAG: Relocations [ X64-DAG: .rsrc$01 { X64-NEXT: 0x1E8 IMAGE_REL_AMD64_ADDR32NB $R000000 -X64-NEXT: 0x198 IMAGE_REL_AMD64_ADDR32NB $R000018 -X64-NEXT: 0x1A8 IMAGE_REL_AMD64_ADDR32NB $R000340 -X64-NEXT: 0x1C8 IMAGE_REL_AMD64_ADDR32NB $R000668 -X64-NEXT: 0x1D8 IMAGE_REL_AMD64_ADDR32NB $R000698 -X64-NEXT: 0x1F8 IMAGE_REL_AMD64_ADDR32NB $R000708 -X64-NEXT: 0x1B8 IMAGE_REL_AMD64_ADDR32NB $R000720 -X64-NEXT: 0x188 IMAGE_REL_AMD64_ADDR32NB $R000750 +X64-NEXT: 0x198 IMAGE_REL_AMD64_ADDR32NB $R000001 +X64-NEXT: 0x1A8 IMAGE_REL_AMD64_ADDR32NB $R000002 +X64-NEXT: 0x1C8 IMAGE_REL_AMD64_ADDR32NB $R000003 +X64-NEXT: 0x1D8 IMAGE_REL_AMD64_ADDR32NB $R000004 +X64-NEXT: 0x1F8 IMAGE_REL_AMD64_ADDR32NB $R000005 +X64-NEXT: 0x1B8 IMAGE_REL_AMD64_ADDR32NB $R000006 +X64-NEXT: 0x188 IMAGE_REL_AMD64_ADDR32NB $R000007 ARM: Machine: IMAGE_FILE_MACHINE_ARMNT (0x1C4) ARM-DAG: Relocations [ ARM-DAG: .rsrc$01 { ARM-NEXT: 0x1E8 IMAGE_REL_ARM_ADDR32NB $R000000 -ARM-NEXT: 0x198 IMAGE_REL_ARM_ADDR32NB $R000018 -ARM-NEXT: 0x1A8 IMAGE_REL_ARM_ADDR32NB $R000340 -ARM-NEXT: 0x1C8 IMAGE_REL_ARM_ADDR32NB $R000668 -ARM-NEXT: 0x1D8 IMAGE_REL_ARM_ADDR32NB $R000698 -ARM-NEXT: 0x1F8 IMAGE_REL_ARM_ADDR32NB $R000708 -ARM-NEXT: 0x1B8 IMAGE_REL_ARM_ADDR32NB $R000720 -ARM-NEXT: 0x188 IMAGE_REL_ARM_ADDR32NB $R000750 +ARM-NEXT: 0x198 IMAGE_REL_ARM_ADDR32NB $R000001 +ARM-NEXT: 0x1A8 IMAGE_REL_ARM_ADDR32NB $R000002 +ARM-NEXT: 0x1C8 IMAGE_REL_ARM_ADDR32NB $R000003 +ARM-NEXT: 0x1D8 IMAGE_REL_ARM_ADDR32NB $R000004 +ARM-NEXT: 0x1F8 IMAGE_REL_ARM_ADDR32NB $R000005 +ARM-NEXT: 0x1B8 IMAGE_REL_ARM_ADDR32NB $R000006 +ARM-NEXT: 0x188 IMAGE_REL_ARM_ADDR32NB $R000007 ARM64: Machine: IMAGE_FILE_MACHINE_ARM64 (0xAA64) ARM64-DAG: Relocations [ ARM64-DAG: .rsrc$01 { ARM64-NEXT: 0x1E8 IMAGE_REL_ARM64_ADDR32NB $R000000 -ARM64-NEXT: 0x198 IMAGE_REL_ARM64_ADDR32NB $R000018 -ARM64-NEXT: 0x1A8 IMAGE_REL_ARM64_ADDR32NB $R000340 -ARM64-NEXT: 0x1C8 IMAGE_REL_ARM64_ADDR32NB $R000668 -ARM64-NEXT: 0x1D8 IMAGE_REL_ARM64_ADDR32NB $R000698 -ARM64-NEXT: 0x1F8 IMAGE_REL_ARM64_ADDR32NB $R000708 -ARM64-NEXT: 0x1B8 IMAGE_REL_ARM64_ADDR32NB $R000720 -ARM64-NEXT: 0x188 IMAGE_REL_ARM64_ADDR32NB $R000750 +ARM64-NEXT: 0x198 IMAGE_REL_ARM64_ADDR32NB $R000001 +ARM64-NEXT: 0x1A8 IMAGE_REL_ARM64_ADDR32NB $R000002 +ARM64-NEXT: 0x1C8 IMAGE_REL_ARM64_ADDR32NB $R000003 +ARM64-NEXT: 0x1D8 IMAGE_REL_ARM64_ADDR32NB $R000004 +ARM64-NEXT: 0x1F8 IMAGE_REL_ARM64_ADDR32NB $R000005 +ARM64-NEXT: 0x1B8 IMAGE_REL_ARM64_ADDR32NB $R000006 +ARM64-NEXT: 0x188 IMAGE_REL_ARM64_ADDR32NB $R000007 diff --git a/test/tools/llvm-cvtres/symbols.test b/test/tools/llvm-cvtres/symbols.test index 2ca3a193ac40..14f5c360d454 100644 --- a/test/tools/llvm-cvtres/symbols.test +++ b/test/tools/llvm-cvtres/symbols.test @@ -13,21 +13,21 @@ RUN: llvm-readobj -symbols %t | FileCheck %s CHECK: Name: $R000000 CHECK-NEXT: Value: 0 CHECK-NEXT: Section: .rsrc$02 -CHECK: Name: $R000018 +CHECK: Name: $R000001 CHECK-NEXT: Value: 24 CHECK-NEXT: Section: .rsrc$02 -CHECK: Name: $R000340 +CHECK: Name: $R000002 CHECK-NEXT: Value: 832 CHECK-NEXT: Section: .rsrc$02 -CHECK: Name: $R000668 +CHECK: Name: $R000003 CHECK-NEXT: Value: 1640 CHECK-NEXT: Section: .rsrc$02 -CHECK: Name: $R000698 +CHECK: Name: $R000004 CHECK-NEXT: Value: 1688 CHECK-NEXT: Section: .rsrc$02 -CHECK: Name: $R000720 +CHECK: Name: $R000006 CHECK-NEXT: Value: 1824 CHECK-NEXT: Section: .rsrc$02 -CHECK: Name: $R000750 +CHECK: Name: $R000007 CHECK-NEXT: Value: 1872 CHECK-NEXT: Section: .rsrc$02 diff --git a/test/tools/llvm-dwarfdump/X86/lookup.s b/test/tools/llvm-dwarfdump/X86/lookup.s index 652844447c11..d09528d667fa 100644 --- a/test/tools/llvm-dwarfdump/X86/lookup.s +++ b/test/tools/llvm-dwarfdump/X86/lookup.s @@ -1,5 +1,10 @@ # RUN: llvm-mc %s -filetype obj -triple x86_64-apple-darwin -o - \ -# RUN: | llvm-dwarfdump -lookup=0x7fffffff - | \ +# RUN: | llvm-dwarfdump -lookup=0xffffffff - | \ +# RUN: FileCheck %s --check-prefix=EMPTY --allow-empty +# EMPTY: {{^$}} + +# RUN: llvm-mc %s -filetype obj -triple x86_64-apple-darwin -o - \ +# RUN: | llvm-dwarfdump -lookup=0xffffffffffffffff - | \ # RUN: FileCheck %s --check-prefix=EMPTY --allow-empty # EMPTY: {{^$}} diff --git a/test/tools/llvm-objcopy/add-section-remove.test b/test/tools/llvm-objcopy/add-section-remove.test new file mode 100644 index 000000000000..0dee1182a28d --- /dev/null +++ b/test/tools/llvm-objcopy/add-section-remove.test @@ -0,0 +1,36 @@ +# RUN: yaml2obj %s > %t +# RUN: echo 0000 > %t.sec +# RUN: llvm-objcopy -R=.test2 -add-section=.test2=%t.sec %t %t2 +# RUN: llvm-readobj -file-headers -sections -section-data %t2 | FileCheck %s + +!ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 +Sections: + - Name: .test1 + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Content: "c3c3c3c3" + - Name: .test2 + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Content: "DEADBEEF" + - Name: .test3 + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Content: "32323232" + +# CHECK: SectionHeaderCount: 7 + +# CHECK: Name: .test1 +# CHECK: Name: .test3 +# CHECK: Name: .symtab +# CHECK: Name: .strtab +# CHECK: Name: .shstrtab +# CHECK: Name: .test2 +# CHECK: SectionData ( +# CHECK-NEXT: 0000: 30303030 +# CHECK-NEXT: ) diff --git a/test/tools/llvm-objcopy/add-section.test b/test/tools/llvm-objcopy/add-section.test new file mode 100644 index 000000000000..048edcba227f --- /dev/null +++ b/test/tools/llvm-objcopy/add-section.test @@ -0,0 +1,37 @@ +# RUN: yaml2obj %s > %t +# RUN: llvm-objcopy -O binary -j .test2 %t %t.sec +# RUN: llvm-objcopy -R=.test2 %t %t2 +# RUN: llvm-objcopy -add-section=.test2=%t.sec %t2 %t3 +# RUN: llvm-readobj -file-headers -sections -section-data %t3 | FileCheck %s + +!ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 +Sections: + - Name: .test1 + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Content: "c3c3c3c3" + - Name: .test2 + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Content: "DEADBEEF" + - Name: .test3 + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Content: "32323232" + +# CHECK: SectionHeaderCount: 7 + +# CHECK: Name: .test1 +# CHECK: Name: .test3 +# CHECK: Name: .symtab +# CHECK: Name: .strtab +# CHECK: Name: .shstrtab +# CHECK: Name: .test2 +# CHECK: SectionData ( +# CHECK-NEXT: 0000: DEADBEEF +# CHECK-NEXT: ) diff --git a/test/tools/llvm-readobj/mips-got.test b/test/tools/llvm-readobj/mips-got.test index 65ccf13f2b4c..a5c15fdfe230 100644 --- a/test/tools/llvm-readobj/mips-got.test +++ b/test/tools/llvm-readobj/mips-got.test @@ -1,4 +1,4 @@ -RUN: llvm-readobj -mips-plt-got %p/Inputs/relocs.obj.elf-mips | \ +RUN: not llvm-readobj -mips-plt-got %p/Inputs/relocs.obj.elf-mips 2>&1 | \ RUN: FileCheck %s -check-prefix GOT-OBJ RUN: llvm-readobj -mips-plt-got %p/Inputs/dynamic-table-exe.mips | \ RUN: FileCheck %s -check-prefix GOT-EXE @@ -11,7 +11,26 @@ RUN: FileCheck %s -check-prefix GOT-EMPTY RUN: llvm-readobj -mips-plt-got %p/Inputs/got-static.exe.mips | \ RUN: FileCheck %s -check-prefix GOT-STATIC -GOT-OBJ: Cannot find .got section +RUN: not llvm-readobj -mips-plt-got %p/Inputs/relocs.obj.elf-mips \ +RUN: --elf-output-style=GNU 2>&1 | \ +RUN: FileCheck %s -check-prefix GNU-GOT-OBJ +RUN: llvm-readobj -mips-plt-got %p/Inputs/dynamic-table-exe.mips \ +RUN: --elf-output-style=GNU | \ +RUN: FileCheck %s -check-prefix GNU-GOT-EXE +RUN: llvm-readobj -mips-plt-got %p/Inputs/dynamic-table-so.mips \ +RUN: --elf-output-style=GNU | \ +RUN: FileCheck %s -check-prefix GNU-GOT-SO +RUN: llvm-readobj -mips-plt-got %p/Inputs/got-tls.so.elf-mips64el \ +RUN: --elf-output-style=GNU | \ +RUN: FileCheck %s -check-prefix GNU-GOT-TLS +RUN: llvm-readobj -mips-plt-got %p/Inputs/got-empty.exe.mipsel \ +RUN: --elf-output-style=GNU | \ +RUN: FileCheck %s -check-prefix GNU-GOT-EMPTY +RUN: llvm-readobj -mips-plt-got %p/Inputs/got-static.exe.mips \ +RUN: --elf-output-style=GNU | \ +RUN: FileCheck %s -check-prefix GNU-GOT-STATIC + +GOT-OBJ: Error reading file: Cannot find .got section. GOT-EXE: Primary GOT { GOT-EXE-NEXT: Canonical gp value: 0x418880 @@ -366,3 +385,116 @@ GOT-STATIC-NEXT: Initial: 0x400104 GOT-STATIC-NEXT: } GOT-STATIC-NEXT: ] GOT-STATIC-NEXT: } + +GNU-GOT-OBJ: Error reading file: Cannot find .got section. + +GNU-GOT-EXE: Primary GOT: +GNU-GOT-EXE-NEXT: Canonical gp value: 00418880 + +GNU-GOT-EXE: Reserved entries: +GNU-GOT-EXE-NEXT: Address Access Initial Purpose +GNU-GOT-EXE-NEXT: 00410890 -32752(gp) 00000000 Lazy resolver +GNU-GOT-EXE-NEXT: 00410894 -32748(gp) 80000000 Module pointer (GNU extension) + +GNU-GOT-EXE: Local entries: +GNU-GOT-EXE-NEXT: Address Access Initial +GNU-GOT-EXE-NEXT: 00410898 -32744(gp) 00400418 +GNU-GOT-EXE-NEXT: 0041089c -32740(gp) 00410840 +GNU-GOT-EXE-NEXT: 004108a0 -32736(gp) 00000000 + +GNU-GOT-EXE: Global entries: +GNU-GOT-EXE-NEXT: Address Access Initial Sym.Val. Type Ndx Name +GNU-GOT-EXE-NEXT: 004108a4 -32732(gp) 00000000 00000000 FUNC UND __gmon_start__ + +GNU-GOT-EXE: PLT GOT: + +GNU-GOT-EXE: Reserved entries: +GNU-GOT-EXE-NEXT: Address Initial Purpose +GNU-GOT-EXE-NEXT: 00410854 00000000 PLT lazy resolver +GNU-GOT-EXE-NEXT: 00410894 80000000 Module pointer + +GNU-GOT-EXE: Entries: +GNU-GOT-EXE-NEXT: Address Initial Sym.Val. Type Ndx Name +GNU-GOT-EXE-NEXT: 0041085c 00400800 00000000 FUNC UND puts +GNU-GOT-EXE-NEXT: 00410860 00400800 00000000 FUNC UND __libc_start_main + +GNU-GOT-SO: Primary GOT: +GNU-GOT-SO-NEXT: Canonical gp value: 000188d0 + +GNU-GOT-SO: Reserved entries: +GNU-GOT-SO-NEXT: Address Access Initial Purpose +GNU-GOT-SO-NEXT: 000108e0 -32752(gp) 00000000 Lazy resolver +GNU-GOT-SO-NEXT: 000108e4 -32748(gp) 80000000 Module pointer (GNU extension) + +GNU-GOT-SO: Local entries: +GNU-GOT-SO-NEXT: Address Access Initial +GNU-GOT-SO-NEXT: 000108e8 -32744(gp) 000108e0 +GNU-GOT-SO-NEXT: 000108ec -32740(gp) 00010000 +GNU-GOT-SO-NEXT: 000108f0 -32736(gp) 00010920 +GNU-GOT-SO-NEXT: 000108f4 -32732(gp) 000108cc +GNU-GOT-SO-NEXT: 000108f8 -32728(gp) 00000000 +GNU-GOT-SO-NEXT: 000108fc -32724(gp) 00000000 +GNU-GOT-SO-NEXT: 00010900 -32720(gp) 00000000 +GNU-GOT-SO-NEXT: 00010904 -32716(gp) 00000000 + +GNU-GOT-SO: Global entries: +GNU-GOT-SO-NEXT: Address Access Initial Sym.Val. Type Ndx Name +GNU-GOT-SO-NEXT: 00010908 -32712(gp) 00000000 00000000 NOTYPE UND _ITM_registerTMCloneTable +GNU-GOT-SO-NEXT: 0001090c -32708(gp) 00000000 00000000 NOTYPE UND _Jv_RegisterClasses +GNU-GOT-SO-NEXT: 00010910 -32704(gp) 00000000 00000000 FUNC UND __gmon_start__ +GNU-GOT-SO-NEXT: 00010914 -32700(gp) 00000840 00000840 FUNC UND puts +GNU-GOT-SO-NEXT: 00010918 -32696(gp) 00000000 00000000 NOTYPE UND _ITM_deregisterTMCloneTable +GNU-GOT-SO-NEXT: 0001091c -32692(gp) 00000000 00000000 FUNC UND __cxa_finalize + +GNU-GOT-TLS: Primary GOT: +GNU-GOT-TLS-NEXT: Canonical gp value: 0000000000018bf0 + +GNU-GOT-TLS: Reserved entries: +GNU-GOT-TLS-NEXT: Address Access Initial Purpose +GNU-GOT-TLS-NEXT: 0000000000010c00 -32752(gp) 0000000000000000 Lazy resolver +GNU-GOT-TLS-NEXT: 0000000000010c08 -32744(gp) 8000000000000000 Module pointer (GNU extension) + +GNU-GOT-TLS: Local entries: +GNU-GOT-TLS-NEXT: Address Access Initial +GNU-GOT-TLS-NEXT: 0000000000010c10 -32736(gp) 0000000000010000 +GNU-GOT-TLS-NEXT: 0000000000010c18 -32728(gp) 0000000000010c00 +GNU-GOT-TLS-NEXT: 0000000000010c20 -32720(gp) 0000000000010cb8 +GNU-GOT-TLS-NEXT: 0000000000010c28 -32712(gp) 0000000000010bf0 +GNU-GOT-TLS-NEXT: 0000000000010c30 -32704(gp) 0000000000000000 +GNU-GOT-TLS-NEXT: 0000000000010c38 -32696(gp) 0000000000000948 +GNU-GOT-TLS-NEXT: 0000000000010c40 -32688(gp) 0000000000000a20 +GNU-GOT-TLS-NEXT: 0000000000010c48 -32680(gp) 0000000000000af0 +GNU-GOT-TLS-NEXT: 0000000000010c50 -32672(gp) 0000000000000000 +GNU-GOT-TLS-NEXT: 0000000000010c58 -32664(gp) 0000000000000000 +GNU-GOT-TLS-NEXT: 0000000000010c60 -32656(gp) 0000000000000000 + +GNU-GOT-TLS: Global entries: +GNU-GOT-TLS-NEXT: Address Access Initial Sym.Val. Type Ndx Name +GNU-GOT-TLS-NEXT: 0000000000010c68 -32648(gp) 0000000000000000 0000000000000000 NOTYPE UND _ITM_registerTMCloneTable +GNU-GOT-TLS-NEXT: 0000000000010c70 -32640(gp) 0000000000000000 0000000000000000 NOTYPE UND _Jv_RegisterClasses +GNU-GOT-TLS-NEXT: 0000000000010c78 -32632(gp) 0000000000000000 0000000000000000 FUNC UND __gmon_start__ +GNU-GOT-TLS-NEXT: 0000000000010c80 -32624(gp) 0000000000000b60 0000000000000b60 FUNC UND __tls_get_addr +GNU-GOT-TLS-NEXT: 0000000000010c88 -32616(gp) 0000000000000000 0000000000000000 NOTYPE UND _ITM_deregisterTMCloneTable +GNU-GOT-TLS-NEXT: 0000000000010c90 -32608(gp) 0000000000000000 0000000000000000 FUNC UND __cxa_finalize + +GNU-GOTY : Primary GOT: +GNU-GOT-EMPTY: Canonical gp value: 00409ff0 + +GNU-GOTY : Reserved entries: +GNU-GOT-EMPTY: Address Access Initial Purpose +GNU-GOT-EMPTY: 00402000 -32752(gp) 00000000 Lazy resolver +GNU-GOT-EMPTY: 00402004 -32748(gp) 80000000 Module pointer (GNU extension) + +GNU-GOT-STATIC: Static GOT: +GNU-GOT-STATIC-NEXT: Canonical gp value: 00418100 + +GNU-GOT-STATIC: Reserved entries: +GNU-GOT-STATIC-NEXT: Address Access Initial Purpose +GNU-GOT-STATIC-NEXT: 00410110 -32752(gp) 00000000 Lazy resolver +GNU-GOT-STATIC-NEXT: 00410114 -32748(gp) 80000000 Module pointer (GNU extension) + +GNU-GOT-STATIC: Local entries: +GNU-GOT-STATIC-NEXT: Address Access Initial +GNU-GOT-STATIC-NEXT: 00410118 -32744(gp) 00400000 +GNU-GOT-STATIC-NEXT: 0041011c -32740(gp) 00400100 +GNU-GOT-STATIC-NEXT: 00410120 -32736(gp) 00400104 diff --git a/test/tools/llvm-readobj/mips-plt.test b/test/tools/llvm-readobj/mips-plt.test index ab0824b0be68..f41940c9cf34 100644 --- a/test/tools/llvm-readobj/mips-plt.test +++ b/test/tools/llvm-readobj/mips-plt.test @@ -1,4 +1,7 @@ RUN: llvm-readobj -mips-plt-got %p/Inputs/got-plt.exe.elf-mipsel | FileCheck %s +RUN: llvm-readobj -mips-plt-got --elf-output-style=GNU \ +RUN: %p/Inputs/got-plt.exe.elf-mipsel \ +RUN: | FileCheck --check-prefix=GNU %s CHECK: PLT GOT { CHECK-NEXT: Reserved entries [ @@ -32,3 +35,32 @@ CHECK-NEXT: Name: __libc_start_main@GLIBC_2.0 (53) CHECK-NEXT: } CHECK-NEXT: ] CHECK-NEXT: } + +GNU: Primary GOT: +GNU-NEXT: Canonical gp value: 00418840 + +GNU: Reserved entries: +GNU-NEXT: Address Access Initial Purpose +GNU-NEXT: 00410850 -32752(gp) 00000000 Lazy resolver +GNU-NEXT: 00410854 -32748(gp) 80000000 Module pointer (GNU extension) + +GNU: Local entries: +GNU-NEXT: Address Access Initial +GNU-NEXT: 00410858 -32744(gp) 004003d4 +GNU-NEXT: 0041085c -32740(gp) 00410800 +GNU-NEXT: 00410860 -32736(gp) 00000000 + +GNU: Global entries: +GNU-NEXT: Address Access Initial Sym.Val. Type Ndx Name +GNU-NEXT: 00410864 -32732(gp) 00000000 00000000 FUNC UND __gmon_start__ +GNU-NEXT: PLT GOT: + +GNU: Reserved entries: +GNU-NEXT: Address Initial Purpose +GNU-NEXT: 00410814 00000000 PLT lazy resolver +GNU-NEXT: 00410854 80000000 Module pointer + +GNU: Entries: +GNU-NEXT: Address Initial Sym.Val. Type Ndx Name +GNU-NEXT: 0041081c 004007c0 00000000 FUNC UND puts +GNU-NEXT: 00410820 004007c0 00000000 FUNC UND __libc_start_main |