Diffstat (limited to 'test/CodeGen/X86')
474 files changed, 44724 insertions, 10554 deletions
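Most of the churn in this diff is mechanical, tracking two IR changes: debug-info metadata moved from typed nodes (metadata !{...}) to bare nodes (!{...}), the llvm.dbg.declare/llvm.dbg.value intrinsics gained a third operand carrying the variable's complex expression (the !{!"0x102"} node seen throughout is the empty expression), and source locations are now spelled !MDLocation(...). A minimal before/after sketch of the pattern, distilled from the hunks below — the variable %x.addr and the node numbering here are hypothetical, not taken from any one test:

; Before: two-operand intrinsic, typed metadata, tuple-encoded location.
  call void @llvm.dbg.declare(metadata !{i32* %x.addr}, metadata !1), !dbg !2
declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
!2 = metadata !{i32 5, i32 0, metadata !3, null}

; After: bare metadata nodes, a trailing expression operand, and an
; explicit !MDLocation node.
  call void @llvm.dbg.declare(metadata i32* %x.addr, metadata !1, metadata !{!"0x102"}), !dbg !2
declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
!2 = !MDLocation(line: 5, scope: !3)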
diff --git a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
index 638d399056a2..62c503da35a6 100644
--- a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
+++ b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=i686-pc-linux-gnu | FileCheck %s
-@__gthrw_pthread_once = alias weak i32 (i32*, void ()*)* @pthread_once ; <i32 (i32*, void ()*)*> [#uses=0]
+@__gthrw_pthread_once = weak alias i32 (i32*, void ()*)* @pthread_once ; <i32 (i32*, void ()*)*> [#uses=0]
 
 define weak i32 @pthread_once(i32*, void ()*) {
   ret i32 0
 }
diff --git a/test/CodeGen/X86/2008-06-18-BadShuffle.ll b/test/CodeGen/X86/2008-06-18-BadShuffle.ll
deleted file mode 100644
index 66f9065799e5..000000000000
--- a/test/CodeGen/X86/2008-06-18-BadShuffle.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: llc < %s -march=x86 -mcpu=i386 -mattr=+sse2 | grep pinsrw
-
-; Test to make sure we actually insert the bottom element of the vector
-define <8 x i16> @a(<8 x i16> %a) nounwind {
-entry:
-  %0 = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> < i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8 >
-  %add = add <8 x i16> %0, %a
-  ret <8 x i16> %add
-}
-
diff --git a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
index 296f0ca135b8..6c8e3b5a8fdc 100644
--- a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
+++ b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
@@ -14,9 +14,9 @@ entry:
   %2 = alloca i64 ; <i64*> [#uses=1]
   %3 = alloca i64 ; <i64*> [#uses=6]
   %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{i8** %s1_addr}, metadata !0), !dbg !7
+  call void @llvm.dbg.declare(metadata i8** %s1_addr, metadata !0, metadata !{!"0x102"}), !dbg !7
   store i8* %s1, i8** %s1_addr
-  call void @llvm.dbg.declare(metadata !{[0 x i8]** %str.0}, metadata !8), !dbg !7
+  call void @llvm.dbg.declare(metadata [0 x i8]** %str.0, metadata !8, metadata !{!"0x102"}), !dbg !7
   %4 = call i8* @llvm.stacksave(), !dbg !7 ; <i8*> [#uses=1]
   store i8* %4, i8** %saved_stack.1, align 8, !dbg !7
   %5 = load i8** %s1_addr, align 8, !dbg !13 ; <i8*> [#uses=1]
@@ -58,7 +58,7 @@ return: ; preds = %entry
   ret i8 %retval12, !dbg !16
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i8* @llvm.stacksave() nounwind
 
@@ -66,22 +66,22 @@ declare i64 @strlen(i8*) nounwind readonly
 
 declare void @llvm.stackrestore(i8*) nounwind
 
-!0 = metadata !{i32 459009, metadata !1, metadata !"s1", metadata !2, i32 2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 458798, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 458769, metadata !17, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 458773, null, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{metadata !5, metadata !6}
-!5 = metadata !{i32 458788, null, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 2, i32 0, metadata !1, null}
-!8 = metadata !{i32 459008, metadata !1, metadata !"str.0", metadata !2, i32 3, metadata !9} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 458753, null, metadata !2, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !5, metadata !11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
-!11 = metadata !{metadata !12}
-!12 = metadata !{i32 458785, i64 0, i64 1} ; [ DW_TAG_subrange_type ]
-!13 = metadata !{i32 3, i32 0, metadata !14, null}
-!14 = metadata !{i32 458763, metadata !17, metadata !1, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{i32 4, i32 0, metadata !14, null}
-!16 = metadata !{i32 5, i32 0, metadata !14, null}
-!17 = metadata !{metadata !"vla.c", metadata !"/tmp/"}
-!18 = metadata !{i32 0}
+!0 = !{!"0x101\00s1\002\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\000\000", i32 0, !2, !3, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", !17, !18, !18, null, null, null} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x15\00\000\000\000\000\000\000", null, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!4 = !{!5, !6}
+!5 = !{!"0x24\00char\000\008\008\000\000\006", null, !2} ; [ DW_TAG_base_type ]
+!6 = !{!"0xf\00\000\0064\0064\000\000", null, !2, !5} ; [ DW_TAG_pointer_type ]
+!7 = !MDLocation(line: 2, scope: !1)
+!8 = !{!"0x100\00str.0\003\000", !1, !2, !9} ; [ DW_TAG_auto_variable ]
+!9 = !{!"0xf\00\000\0064\0064\000\0064", null, !2, !10} ; [ DW_TAG_pointer_type ]
+!10 = !{!"0x1\00\000\008\008\000\000", null, !2, !5, !11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
+!11 = !{!12}
+!12 = !{!"0x21\000\001"} ; [ DW_TAG_subrange_type ]
+!13 = !MDLocation(line: 3, scope: !14)
+!14 = !{!"0xb\000\000\000", !17, !1} ; [ DW_TAG_lexical_block ]
+!15 = !MDLocation(line: 4, scope: !14)
+!16 = !MDLocation(line: 5, scope: !14)
+!17 = !{!"vla.c", !"/tmp/"}
+!18 = !{i32 0}
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 764c2cdd6d99..e046b966921f 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "4 machine-licm"
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "7 machine-licm"
 ; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
 ; rdar://6627786
 ; rdar://7792037
diff --git a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
deleted file mode 100644
index e1930e012dd8..000000000000
--- a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc -mtriple=i386-apple-darwin10.0 -relocation-model=pic -asm-verbose=false \
-; RUN:   -mcpu=generic -disable-fp-elim -mattr=-sse4.1,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
-; RUN:   FileCheck %s
-; rdar://6808032
-
-; CHECK: pextrw $14
-; CHECK-NEXT: shrl $8
-; CHECK-NEXT: pinsrw
-
-define void @update(i8** %args_list) nounwind {
-entry:
-  %cmp.i = icmp eq i32 0, 0 ; <i1> [#uses=1]
-  br i1 %cmp.i, label %if.then.i, label %test_cl.exit
-
-if.then.i: ; preds = %entry
-  %val = load <16 x i8> addrspace(1)* null ; <<16 x i8>> [#uses=8]
-  %tmp10.i = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 undef, i8 0, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef>, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 4, i32 undef, i32 6, i32 undef, i32 29, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1]
-  %tmp17.i = shufflevector <16 x i8> %tmp10.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 18, i32 4, i32 undef, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1]
-  %tmp24.i = shufflevector <16 x i8> %tmp17.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 24, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 undef, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1]
-  %tmp31.i = shufflevector <16 x i8> %tmp24.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 21, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1]
-  %tmp38.i = shufflevector <16 x i8> %tmp31.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 27, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 13, i32 undef, i32 undef> ; <<16 x i8>> [#uses=1]
-  %tmp45.i = shufflevector <16 x i8> %tmp38.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 undef, i32 10, i32 11, i32 12, i32 13, i32 29, i32 undef> ; <<16 x i8>> [#uses=1]
-  %tmp52.i = shufflevector <16 x i8> %tmp45.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 21, i32 10, i32 11, i32 12, i32 13, i32 14, i32 undef> ; <<16 x i8>> [#uses=1]
-  %tmp59.i = shufflevector <16 x i8> %tmp52.i, <16 x i8> %val, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 20> ; <<16 x i8>> [#uses=1]
-  store <16 x i8> %tmp59.i, <16 x i8> addrspace(1)* null
-  ret void
-
-test_cl.exit: ; preds = %entry
-  ret void
-}
diff --git a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
index 50c62dfb73b8..ffbe02c71356 100644
--- a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
+++ b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 > %t1
-; RUN: grep movzwl %t1 | count 2
-; RUN: grep movzbl %t1 | count 1
-; RUN: grep movd %t1 | count 4
+; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
 
 define <4 x i16> @a(i32* %x1) nounwind {
+; CHECK-LABEL: a:
+; CHECK: shrl %[[R:[^,]+]]
+; CHECK-NEXT: movd %[[R]], %xmm0
+; CHECK-NEXT: retl
+
   %x2 = load i32* %x1
   %x3 = lshr i32 %x2, 1
   %x = trunc i32 %x3 to i16
@@ -12,6 +14,12 @@
 }
 
 define <8 x i16> @b(i32* %x1) nounwind {
+; CHECK-LABEL: b:
+; CHECK: shrl %e[[R:.]]x
+; CHECK-NEXT: movzwl %[[R]]x, %e[[R]]x
+; CHECK-NEXT: movd %e[[R]]x, %xmm0
+; CHECK-NEXT: retl
+
   %x2 = load i32* %x1
   %x3 = lshr i32 %x2, 1
   %x = trunc i32 %x3 to i16
@@ -20,6 +28,12 @@
 }
 
 define <8 x i8> @c(i32* %x1) nounwind {
+; CHECK-LABEL: c:
+; CHECK: shrl %e[[R:.]]x
+; CHECK-NEXT: movzwl %[[R]]x, %e[[R]]x
+; CHECK-NEXT: movd %e[[R]]x, %xmm0
+; CHECK-NEXT: retl
+
   %x2 = load i32* %x1
   %x3 = lshr i32 %x2, 1
   %x = trunc i32 %x3 to i8
@@ -28,6 +42,12 @@
 }
 
 define <16 x i8> @d(i32* %x1) nounwind {
+; CHECK-LABEL: d:
+; CHECK: shrl %e[[R:.]]x
+; CHECK-NEXT: movzbl %[[R]]l, %e[[R]]x
+; CHECK-NEXT: movd %e[[R]]x, %xmm0
+; CHECK-NEXT: retl
+
   %x2 = load i32* %x1
   %x3 = lshr i32 %x2, 1
   %x = trunc i32 %x3 to i8
diff --git a/test/CodeGen/X86/2009-10-16-Scope.ll b/test/CodeGen/X86/2009-10-16-Scope.ll
index a936edc120d8..e75d594e6682 100644
--- a/test/CodeGen/X86/2009-10-16-Scope.ll
+++ b/test/CodeGen/X86/2009-10-16-Scope.ll
@@ -9,7 +9,7 @@ entry:
   br label %do.body, !dbg !0
 
 do.body: ; preds = %entry
-  call void @llvm.dbg.declare(metadata !{i32* %count_}, metadata !4)
+  call void @llvm.dbg.declare(metadata i32* %count_, metadata !4, metadata !{!"0x102"})
   %conv = ptrtoint i32* %count_ to i32, !dbg !0 ; <i32> [#uses=1]
   %call = call i32 @foo(i32 %conv) ssp, !dbg !0 ; <i32> [#uses=0]
   br label %do.end, !dbg !0
@@ -18,17 +18,17 @@ do.end: ; preds = %do.body
   ret void, !dbg !7
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 declare i32 @foo(i32) ssp
 
-!0 = metadata !{i32 5, i32 2, metadata !1, null}
-!1 = metadata !{i32 458763, null, metadata !2, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!3 = metadata !{i32 458769, metadata !8, i32 12, metadata !"clang 1.1", i1 true, metadata !"", i32 0, null, metadata !9, null, null, null, metadata !""}; [DW_TAG_compile_unit ]
-!4 = metadata !{i32 459008, metadata !5, metadata !"count_", metadata !3, i32 5, metadata !6}; [ DW_TAG_auto_variable ]
-!5 = metadata !{i32 458763, null, metadata !1, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!6 = metadata !{i32 458788, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}; [DW_TAG_base_type ]
-!7 = metadata !{i32 6, i32 1, metadata !2, null}
-!8 = metadata !{metadata !"genmodes.i", metadata !"/Users/yash/Downloads"}
-!9 = metadata !{i32 0}
+!0 = !MDLocation(line: 5, column: 2, scope: !1)
+!1 = !{!"0xb\001\001\000", null, !2}; [DW_TAG_lexical_block ]
+!2 = !{!"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", i32 0, !3, null, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!3 = !{!"0x11\0012\00clang 1.1\001\00\000\00\000", !8, null, !9, null, null, null}; [DW_TAG_compile_unit ]
+!4 = !{!"0x100\00count_\005\000", !5, !3, !6}; [ DW_TAG_auto_variable ]
+!5 = !{!"0xb\001\001\000", null, !1}; [DW_TAG_lexical_block ]
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !3}; [DW_TAG_base_type ]
+!7 = !MDLocation(line: 6, column: 1, scope: !2)
+!8 = !{!"genmodes.i", !"/Users/yash/Downloads"}
+!9 = !{i32 0}
diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll
index f99e68242811..b21846d39494 100644
--- a/test/CodeGen/X86/2010-01-18-DbgValue.ll
+++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll
@@ -12,7 +12,7 @@ entry:
   %retval = alloca double ; <double*> [#uses=2]
   %0 = alloca double ; <double*> [#uses=2]
   %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
-  call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0), !dbg !15
+  call void @llvm.dbg.declare(metadata %struct.Rect* %my_r0, metadata !0, metadata !{!"0x102"}), !dbg !15
   %1 = getelementptr inbounds %struct.Rect* %my_r0, i32 0, i32 0, !dbg !16 ; <%struct.Pt*> [#uses=1]
   %2 = getelementptr inbounds %struct.Pt* %1, i32 0, i32 0, !dbg !16 ; <double*> [#uses=1]
   %3 = load double* %2, align 8, !dbg !16 ; <double> [#uses=1]
@@ -26,30 +26,30 @@ return: ; preds = %entry
   ret double %retval1, !dbg !16
 }
 
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!21}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"my_r0", metadata !2, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 11, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, double (%struct.Rect*)* @foo, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !19, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6, metadata !7}
-!6 = metadata !{i32 786468, metadata !19, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Rect", i32 6, i64 256, i64 64, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
-!8 = metadata !{metadata !9, metadata !14}
-!9 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P1", i32 7, i64 128, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Pt", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
-!11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"x", i32 2, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!13 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"y", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
-!14 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P2", i32 8, i64 128, i64 64, i64 128, i32 0, metadata !10} ; [ DW_TAG_member ]
-!15 = metadata !{i32 11, i32 0, metadata !1, null}
-!16 = metadata !{i32 12, i32 0, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !19, metadata !1, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{metadata !1}
-!19 = metadata !{metadata !"b2.c", metadata !"/tmp/"}
-!20 = metadata !{i32 0}
-!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!0 = !{!"0x101\00my_r0\0011\000", !1, !2, !7} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00foo\00foo\00foo\0011\000\001\000\006\000\000\0011", !19, !2, !4, null, double (%struct.Rect*)* @foo, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !19} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\000\00\000\00\000", !19, !20, !20, !18, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !19, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6, !7}
+!6 = !{!"0x24\00double\000\0064\0064\000\000\004", !19, !2} ; [ DW_TAG_base_type ]
+!7 = !{!"0x13\00Rect\006\00256\0064\000\000\000", !19, !2, null, !8, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
+!8 = !{!9, !14}
+!9 = !{!"0xd\00P1\007\00128\0064\000\000", !19, !7, !10} ; [ DW_TAG_member ]
+!10 = !{!"0x13\00Pt\001\00128\0064\000\000\000", !19, !2, null, !11, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
+!11 = !{!12, !13}
+!12 = !{!"0xd\00x\002\0064\0064\000\000", !19, !10, !6} ; [ DW_TAG_member ]
+!13 = !{!"0xd\00y\003\0064\0064\0064\000", !19, !10, !6} ; [ DW_TAG_member ]
+!14 = !{!"0xd\00P2\008\00128\0064\00128\000", !19, !7, !10} ; [ DW_TAG_member ]
+!15 = !MDLocation(line: 11, scope: !1)
+!16 = !MDLocation(line: 12, scope: !17)
+!17 = !{!"0xb\0011\000\000", !19, !1} ; [ DW_TAG_lexical_block ]
+!18 = !{!1}
+!19 = !{!"b2.c", !"/tmp/"}
+!20 = !{i32 0}
+!21 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
index 4d4e8c197d87..b85f1afea0cf 100644
--- a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
+++ b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
@@ -8,28 +8,28 @@
 define i32 @"main(tart.core.String[])->int32"(i32 %args) {
 entry:
-  tail call void @llvm.dbg.value(metadata !14, i64 0, metadata !8)
+  tail call void @llvm.dbg.value(metadata %tart.reflect.ComplexType* @.type.SwitchStmtTest, i64 0, metadata !8, metadata !{!"0x102"})
   tail call void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType* @.type.SwitchStmtTest) ; <%tart.core.Object*> [#uses=2]
   ret i32 3
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.ComplexType*) nounwind readnone
 
-!0 = metadata !{i32 458769, metadata !15, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !16, metadata !16, null, null, null, i32 0} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 458790, metadata !15, metadata !0, metadata !"", i32 0, i64 192, i64 64, i64 0, i32 0, metadata !2} ; [ DW_TAG_const_type ]
-!2 = metadata !{i32 458771, metadata !15, metadata !0, metadata !"C", i32 1, i64 192, i64 64, i64 0, i32 0, null, metadata !3, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ]
-!3 = metadata !{metadata !4, metadata !6, metadata !7}
-!4 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"x", i32 1, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
-!5 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"y", i32 1, i64 64, i64 64, i64 64, i32 0, metadata !5} ; [ DW_TAG_member ]
-!7 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"z", i32 1, i64 64, i64 64, i64 128, i32 0, metadata !5} ; [ DW_TAG_member ]
-!8 = metadata !{i32 459008, metadata !9, metadata !"t", metadata !0, i32 5, metadata !2} ; [ DW_TAG_auto_variable ]
-!9 = metadata !{i32 458763, null, metadata !10, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!10 = metadata !{i32 458798, i32 0, metadata !0, metadata !"foo", metadata !"foo", metadata !"foo", i32 4, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 458773, metadata !15, metadata !0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!12 = metadata !{metadata !13}
-!13 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{%tart.reflect.ComplexType* @.type.SwitchStmtTest}
-!15 = metadata !{metadata !"sm.c", metadata !""}
-!16 = metadata !{i32 0}
+!0 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\000", !15, !16, !16, null, null, null} ; [ DW_TAG_compile_unit ]
+!1 = !{!"0x26\00\000\00192\0064\000\000", !15, !0, !2} ; [ DW_TAG_const_type ]
+!2 = !{!"0x13\00C\001\00192\0064\000\000\000", !15, !0, null, !3, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ]
+!3 = !{!4, !6, !7}
+!4 = !{!"0xd\00x\001\0064\0064\000\000", !15, !2, !5} ; [ DW_TAG_member ]
+!5 = !{!"0x24\00double\000\0064\0064\000\000\004", !15, !0} ; [ DW_TAG_base_type ]
+!6 = !{!"0xd\00y\001\0064\0064\0064\000", !15, !2, !5} ; [ DW_TAG_member ]
+!7 = !{!"0xd\00z\001\0064\0064\00128\000", !15, !2, !5} ; [ DW_TAG_member ]
+!8 = !{!"0x100\00t\005\000", !9, !0, !2} ; [ DW_TAG_auto_variable ]
+!9 = !{!"0xb\000\000\000", null, !10} ; [ DW_TAG_lexical_block ]
+!10 = !{!"0x2e\00foo\00foo\00foo\004\000\001\000\006\000\000\000", i32 0, !0, !11, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!11 = !{!"0x15\00\000\000\000\000\000\000", !15, !0, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = !{!13}
+!13 = !{!"0x24\00int\000\0032\0032\000\000\005", !15, !0} ; [ DW_TAG_base_type ]
+!14 = !{%tart.reflect.ComplexType* @.type.SwitchStmtTest}
+!15 = !{!"sm.c", !""}
+!16 = !{i32 0}
diff --git a/test/CodeGen/X86/2010-02-11-NonTemporal.ll b/test/CodeGen/X86/2010-02-11-NonTemporal.ll
index 5789a0b9847f..f9cca8c70c76 100644
--- a/test/CodeGen/X86/2010-02-11-NonTemporal.ll
+++ b/test/CodeGen/X86/2010-02-11-NonTemporal.ll
@@ -3,7 +3,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
 
-!0 = metadata !{ i32 1 }
+!0 = !{ i32 1 }
 
 define void @sub_(i32* noalias %n) {
 "file movnt.f90, line 2, bb1":
diff --git a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
index 5372bc522785..60025bfcdc81 100644
--- a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
+++ b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
@@ -7,7 +7,7 @@ entry:
   %tmp1 = bitcast double %a to <8 x i8>
   %tmp2 = bitcast double %b to <8 x i8>
   %tmp3 = add <8 x i8> %tmp1, %tmp2
-; CHECK: paddw
+; CHECK: paddb
   store <8 x i8> %tmp3, <8 x i8>* null
   ret void
 }
@@ -18,7 +18,7 @@ entry:
   %tmp1 = bitcast double %a to <4 x i16>
   %tmp2 = bitcast double %b to <4 x i16>
   %tmp3 = add <4 x i16> %tmp1, %tmp2
-; CHECK: paddd
+; CHECK: paddw
   store <4 x i16> %tmp3, <4 x i16>* null
   ret void
 }
@@ -29,7 +29,7 @@ entry:
   %tmp1 = bitcast double %a to <2 x i32>
   %tmp2 = bitcast double %b to <2 x i32>
   %tmp3 = add <2 x i32> %tmp1, %tmp2
-; CHECK: paddq
+; CHECK: paddd
   store <2 x i32> %tmp3, <2 x i32>* null
   ret void
 }
diff --git a/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll b/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll
index fc8c895af5b4..86be390b8228 100644
--- a/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll
+++ b/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll
@@ -29,4 +29,4 @@ entry:
   ret i8* %1
 }
 
-!0 = metadata !{i32 79}
+!0 = !{i32 79}
diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
index 7faee993a7d1..0d30a3f88eb9 100644
--- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
@@ -2,8 +2,7 @@
 ; RUN: llc -mtriple=x86_64-pc-linux -O2 -regalloc=basic < %s | FileCheck %s
 ; Test to check .debug_loc support. This test case emits many debug_loc entries.
 
-; CHECK: Loc expr size
-; CHECK-NEXT: .short
+; CHECK: .short {{.*}} # Loc expr size
 ; CHECK-NEXT: .Ltmp
 ; CHECK-NEXT: DW_OP_reg
 
@@ -11,10 +10,10 @@
 define hidden %0 @__divsc3(float %a, float %b, float %c, float %d) nounwind readnone {
 entry:
-  tail call void @llvm.dbg.value(metadata !{float %a}, i64 0, metadata !0)
-  tail call void @llvm.dbg.value(metadata !{float %b}, i64 0, metadata !11)
-  tail call void @llvm.dbg.value(metadata !{float %c}, i64 0, metadata !12)
-  tail call void @llvm.dbg.value(metadata !{float %d}, i64 0, metadata !13)
+  tail call void @llvm.dbg.value(metadata float %a, i64 0, metadata !0, metadata !{!"0x102"})
+  tail call void @llvm.dbg.value(metadata float %b, i64 0, metadata !11, metadata !{!"0x102"})
+  tail call void @llvm.dbg.value(metadata float %c, i64 0, metadata !12, metadata !{!"0x102"})
+  tail call void @llvm.dbg.value(metadata float %d, i64 0, metadata !13, metadata !{!"0x102"})
   %0 = tail call float @fabsf(float %c) nounwind readnone, !dbg !19 ; <float> [#uses=1]
   %1 = tail call float @fabsf(float %d) nounwind readnone, !dbg !19 ; <float> [#uses=1]
   %2 = fcmp olt float %0, %1, !dbg !19 ; <i1> [#uses=1]
@@ -22,34 +21,34 @@ entry:
 bb: ; preds = %entry
   %3 = fdiv float %c, %d, !dbg !20 ; <float> [#uses=3]
-  tail call void @llvm.dbg.value(metadata !{float %3}, i64 0, metadata !16), !dbg !20
+  tail call void @llvm.dbg.value(metadata float %3, i64 0, metadata !16, metadata !{!"0x102"}), !dbg !20
   %4 = fmul float %3, %c, !dbg !21 ; <float> [#uses=1]
   %5 = fadd float %4, %d, !dbg !21 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %5}, i64 0, metadata !14), !dbg !21
+  tail call void @llvm.dbg.value(metadata float %5, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !21
   %6 = fmul float %3, %a, !dbg !22 ; <float> [#uses=1]
   %7 = fadd float %6, %b, !dbg !22 ; <float> [#uses=1]
   %8 = fdiv float %7, %5, !dbg !22 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %8}, i64 0, metadata !17), !dbg !22
+  tail call void @llvm.dbg.value(metadata float %8, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !22
   %9 = fmul float %3, %b, !dbg !23 ; <float> [#uses=1]
   %10 = fsub float %9, %a, !dbg !23 ; <float> [#uses=1]
   %11 = fdiv float %10, %5, !dbg !23 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %11}, i64 0, metadata !18), !dbg !23
+  tail call void @llvm.dbg.value(metadata float %11, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !23
   br label %bb2, !dbg !23
 
 bb1: ; preds = %entry
   %12 = fdiv float %d, %c, !dbg !24 ; <float> [#uses=3]
-  tail call void @llvm.dbg.value(metadata !{float %12}, i64 0, metadata !16), !dbg !24
+  tail call void @llvm.dbg.value(metadata float %12, i64 0, metadata !16, metadata !{!"0x102"}), !dbg !24
   %13 = fmul float %12, %d, !dbg !25 ; <float> [#uses=1]
   %14 = fadd float %13, %c, !dbg !25 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %14}, i64 0, metadata !14), !dbg !25
+  tail call void @llvm.dbg.value(metadata float %14, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !25
   %15 = fmul float %12, %b, !dbg !26 ; <float> [#uses=1]
   %16 = fadd float %15, %a, !dbg !26 ; <float> [#uses=1]
   %17 = fdiv float %16, %14, !dbg !26 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %17}, i64 0, metadata !17), !dbg !26
+  tail call void @llvm.dbg.value(metadata float %17, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !26
   %18 = fmul float %12, %a, !dbg !27 ; <float> [#uses=1]
   %19 = fsub float %b, %18, !dbg !27 ; <float> [#uses=1]
   %20 = fdiv float %19, %14, !dbg !27 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %20}, i64 0, metadata !18), !dbg !27
+  tail call void @llvm.dbg.value(metadata float %20, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !27
   br label %bb2, !dbg !27
 
 bb2: ; preds = %bb1, %bb
@@ -75,9 +74,9 @@ bb6: ; preds = %bb4
 bb8: ; preds = %bb6
   %27 = tail call float @copysignf(float 0x7FF0000000000000, float %c) nounwind readnone, !dbg !30 ; <float> [#uses=2]
   %28 = fmul float %27, %a, !dbg !30 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %28}, i64 0, metadata !17), !dbg !30
+  tail call void @llvm.dbg.value(metadata float %28, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !30
   %29 = fmul float %27, %b, !dbg !31 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %29}, i64 0, metadata !18), !dbg !31
+  tail call void @llvm.dbg.value(metadata float %29, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !31
   br label %bb46, !dbg !31
 
 bb9: ; preds = %bb6, %bb4
@@ -107,24 +106,24 @@ bb15: ; preds = %bb14
 bb16: ; preds = %bb15
   %iftmp.0.0 = select i1 %33, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %42 = tail call float @copysignf(float %iftmp.0.0, float %a) nounwind readnone, !dbg !33 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %42}, i64 0, metadata !0), !dbg !33
+  tail call void @llvm.dbg.value(metadata float %42, i64 0, metadata !0, metadata !{!"0x102"}), !dbg !33
   %43 = fcmp ord float %b, 0.000000e+00 ; <i1> [#uses=1]
   %44 = fsub float %b, %b, !dbg !34 ; <float> [#uses=1]
   %45 = fcmp uno float %44, 0.000000e+00 ; <i1> [#uses=1]
   %46 = and i1 %43, %45, !dbg !34 ; <i1> [#uses=1]
   %iftmp.1.0 = select i1 %46, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %47 = tail call float @copysignf(float %iftmp.1.0, float %b) nounwind readnone, !dbg !34 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %47}, i64 0, metadata !11), !dbg !34
+  tail call void @llvm.dbg.value(metadata float %47, i64 0, metadata !11, metadata !{!"0x102"}), !dbg !34
   %48 = fmul float %42, %c, !dbg !35 ; <float> [#uses=1]
   %49 = fmul float %47, %d, !dbg !35 ; <float> [#uses=1]
   %50 = fadd float %48, %49, !dbg !35 ; <float> [#uses=1]
   %51 = fmul float %50, 0x7FF0000000000000, !dbg !35 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %51}, i64 0, metadata !17), !dbg !35
+  tail call void @llvm.dbg.value(metadata float %51, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !35
   %52 = fmul float %47, %c, !dbg !36 ; <float> [#uses=1]
   %53 = fmul float %42, %d, !dbg !36 ; <float> [#uses=1]
   %54 = fsub float %52, %53, !dbg !36 ; <float> [#uses=1]
   %55 = fmul float %54, 0x7FF0000000000000, !dbg !36 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %55}, i64 0, metadata !18), !dbg !36
+  tail call void @llvm.dbg.value(metadata float %55, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !36
   br label %bb46, !dbg !36
 
 bb27: ; preds = %bb15, %bb14, %bb11
@@ -155,24 +154,24 @@ bb34: ; preds = %bb33, %bb30
 bb35: ; preds = %bb34
   %iftmp.2.0 = select i1 %59, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %67 = tail call float @copysignf(float %iftmp.2.0, float %c) nounwind readnone, !dbg !38 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %67}, i64 0, metadata !12), !dbg !38
+  tail call void @llvm.dbg.value(metadata float %67, i64 0, metadata !12, metadata !{!"0x102"}), !dbg !38
   %68 = fcmp ord float %d, 0.000000e+00 ; <i1> [#uses=1]
   %69 = fsub float %d, %d, !dbg !39 ; <float> [#uses=1]
   %70 = fcmp uno float %69, 0.000000e+00 ; <i1> [#uses=1]
   %71 = and i1 %68, %70, !dbg !39 ; <i1> [#uses=1]
   %iftmp.3.0 = select i1 %71, float 1.000000e+00, float 0.000000e+00 ; <float> [#uses=1]
   %72 = tail call float @copysignf(float %iftmp.3.0, float %d) nounwind readnone, !dbg !39 ; <float> [#uses=2]
-  tail call void @llvm.dbg.value(metadata !{float %72}, i64 0, metadata !13), !dbg !39
+  tail call void @llvm.dbg.value(metadata float %72, i64 0, metadata !13, metadata !{!"0x102"}), !dbg !39
   %73 = fmul float %67, %a, !dbg !40 ; <float> [#uses=1]
   %74 = fmul float %72, %b, !dbg !40 ; <float> [#uses=1]
   %75 = fadd float %73, %74, !dbg !40 ; <float> [#uses=1]
   %76 = fmul float %75, 0.000000e+00, !dbg !40 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %76}, i64 0, metadata !17), !dbg !40
+  tail call void @llvm.dbg.value(metadata float %76, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !40
   %77 = fmul float %67, %b, !dbg !41 ; <float> [#uses=1]
   %78 = fmul float %72, %a, !dbg !41 ; <float> [#uses=1]
   %79 = fsub float %77, %78, !dbg !41 ; <float> [#uses=1]
   %80 = fmul float %79, 0.000000e+00, !dbg !41 ; <float> [#uses=1]
-  tail call void @llvm.dbg.value(metadata !{float %80}, i64 0, metadata !18), !dbg !41
+  tail call void @llvm.dbg.value(metadata float %80, i64 0, metadata !18, metadata !{!"0x102"}), !dbg !41
   br label %bb46, !dbg !41
 
 bb46: ; preds = %bb35, %bb34, %bb33, %bb30, %bb16, %bb8, %bb2
@@ -196,57 +195,57 @@ declare float @fabsf(float)
 
 declare float @copysignf(float, float) nounwind readnone
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!48}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !45, metadata !2, metadata !"__divsc3", metadata !"__divsc3", metadata !"__divsc3", i32 1922, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, %0 (float, float, float, float)* @__divsc3, null, null, metadata !43, i32 1922} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !45} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !45, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !47, metadata !47, metadata !44, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !45, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6, metadata !9, metadata !9, metadata !9, metadata !9}
-!6 = metadata !{i32 786454, metadata !46, metadata !7, metadata !"SCtype", i32 170, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ]
-!7 = metadata !{i32 786473, metadata !46} ; [ DW_TAG_file_type ]
-!8 = metadata !{i32 786468, metadata !45, metadata !2, metadata !"complex float", i32 0, i64 64, i64 32, i64 0, i32 0, i32 3} ; [ DW_TAG_base_type ]
-!9 = metadata !{i32 786454, metadata !46, metadata !7, metadata !"SFtype", i32 167, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_typedef ]
-!10 = metadata !{i32 786468, metadata !45, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!11 = metadata !{i32 786689, metadata !1, metadata !"b", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{i32 786689, metadata !1, metadata !"d", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
-!14 = metadata !{i32 786688, metadata !15, metadata !"denom", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!15 = metadata !{i32 786443, metadata !45, metadata !1, i32 1922, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 786688, metadata !15, metadata !"ratio", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!17 = metadata !{i32 786688, metadata !15, metadata !"x", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{i32 786688, metadata !15, metadata !"y", metadata !2, i32 1923, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!19 = metadata !{i32 1929, i32 0, metadata !15, null}
-!20 = metadata !{i32 1931, i32 0, metadata !15, null}
-!21 = metadata !{i32 1932, i32 0, metadata !15, null}
-!22 = metadata !{i32 1933, i32 0, metadata !15, null}
-!23 = metadata !{i32 1934, i32 0, metadata !15, null}
-!24 = metadata !{i32 1938, i32 0, metadata !15, null}
-!25 = metadata !{i32 1939, i32 0, metadata !15, null}
-!26 = metadata !{i32 1940, i32 0, metadata !15, null}
-!27 = metadata !{i32 1941, i32 0, metadata !15, null}
-!28 = metadata !{i32 1946, i32 0, metadata !15, null}
-!29 = metadata !{i32 1948, i32 0, metadata !15, null}
-!30 = metadata !{i32 1950, i32 0, metadata !15, null}
-!31 = metadata !{i32 1951, i32 0, metadata !15, null}
-!32 = metadata !{i32 1953, i32 0, metadata !15, null}
-!33 = metadata !{i32 1955, i32 0, metadata !15, null}
-!34 = metadata !{i32 1956, i32 0, metadata !15, null}
-!35 = metadata !{i32 1957, i32 0, metadata !15, null}
-!36 = metadata !{i32 1958, i32 0, metadata !15, null}
-!37 = metadata !{i32 1960, i32 0, metadata !15, null}
-!38 = metadata !{i32 1962, i32 0, metadata !15, null}
-!39 = metadata !{i32 1963, i32 0, metadata !15, null}
-!40 = metadata !{i32 1964, i32 0, metadata !15, null}
-!41 = metadata !{i32 1965, i32 0, metadata !15, null}
-!42 = metadata !{i32 1969, i32 0, metadata !15, null}
-!43 = metadata !{metadata !0, metadata !11, metadata !12, metadata !13, metadata !14, metadata !16, metadata !17, metadata !18}
-!44 = metadata !{metadata !1}
-!45 = metadata !{metadata !"libgcc2.c", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
-!46 = metadata !{metadata !"libgcc2.h", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
-!47 = metadata !{i32 0}
-!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!0 = !{!"0x101\00a\001921\000", !1, !2, !9} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00__divsc3\00__divsc3\00__divsc3\001922\000\001\000\006\000\001\001922", !45, !2, !4, null, %0 (float, float, float, float)* @__divsc3, null, null, !43} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !45} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", !45, !47, !47, !44, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !45, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6, !9, !9, !9, !9}
+!6 = !{!"0x16\00SCtype\00170\000\000\000\000", !46, !7, !8} ; [ DW_TAG_typedef ]
+!7 = !{!"0x29", !46} ; [ DW_TAG_file_type ]
+!8 = !{!"0x24\00complex float\000\0064\0032\000\000\003", !45, !2} ; [ DW_TAG_base_type ]
+!9 = !{!"0x16\00SFtype\00167\000\000\000\000", !46, !7, !10} ; [ DW_TAG_typedef ]
+!10 = !{!"0x24\00float\000\0032\0032\000\000\004", !45, !2} ; [ DW_TAG_base_type ]
+!11 = !{!"0x101\00b\001921\000", !1, !2, !9} ; [ DW_TAG_arg_variable ]
+!12 = !{!"0x101\00c\001921\000", !1, !2, !9} ; [ DW_TAG_arg_variable ]
+!13 = !{!"0x101\00d\001921\000", !1, !2, !9} ; [ DW_TAG_arg_variable ]
+!14 = !{!"0x100\00denom\001923\000", !15, !2, !9} ; [ DW_TAG_auto_variable ]
+!15 = !{!"0xb\001922\000\000", !45, !1} ; [ DW_TAG_lexical_block ]
+!16 = !{!"0x100\00ratio\001923\000", !15, !2, !9} ; [ DW_TAG_auto_variable ]
+!17 = !{!"0x100\00x\001923\000", !15, !2, !9} ; [ DW_TAG_auto_variable ]
+!18 = !{!"0x100\00y\001923\000", !15, !2, !9} ; [ DW_TAG_auto_variable ]
+!19 = !MDLocation(line: 1929, scope: !15)
+!20 = !MDLocation(line: 1931, scope: !15)
+!21 = !MDLocation(line: 1932, scope: !15)
+!22 = !MDLocation(line: 1933, scope: !15)
+!23 = !MDLocation(line: 1934, scope: !15)
+!24 = !MDLocation(line: 1938, scope: !15)
+!25 = !MDLocation(line: 1939, scope: !15)
+!26 = !MDLocation(line: 1940, scope: !15)
+!27 = !MDLocation(line: 1941, scope: !15)
+!28 = !MDLocation(line: 1946, scope: !15)
+!29 = !MDLocation(line: 1948, scope: !15)
+!30 = !MDLocation(line: 1950, scope: !15)
+!31 = !MDLocation(line: 1951, scope: !15)
+!32 = !MDLocation(line: 1953, scope: !15)
+!33 = !MDLocation(line: 1955, scope: !15)
+!34 = !MDLocation(line: 1956, scope: !15)
+!35 = !MDLocation(line: 1957, scope: !15)
+!36 = !MDLocation(line: 1958, scope: !15)
+!37 = !MDLocation(line: 1960, scope: !15)
+!38 = !MDLocation(line: 1962, scope: !15)
+!39 = !MDLocation(line: 1963, scope: !15)
+!40 = !MDLocation(line: 1964, scope: !15)
+!41 = !MDLocation(line: 1965, scope: !15)
+!42 = !MDLocation(line: 1969, scope: !15)
+!43 = !{!0, !11, !12, !13, !14, !16, !17, !18}
+!44 = !{!1}
+!45 = !{!"libgcc2.c", !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
+!46 = !{!"libgcc2.h", !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
+!47 = !{i32 0}
+!48 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index e11b5382c23f..9915a706e5ee 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-darwin10"
 
 define i8* @bar(%struct.a* %myvar) nounwind optsize noinline ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.a* %myvar}, i64 0, metadata !8)
+  tail call void @llvm.dbg.value(metadata %struct.a* %myvar, i64 0, metadata !8, metadata !{!"0x102"})
   %0 = getelementptr inbounds %struct.a* %myvar, i64 0, i32 0, !dbg !28 ; <i32*> [#uses=1]
   %1 = load i32* %0, align 8, !dbg !28 ; <i32> [#uses=1]
   tail call void @foo(i32 %1) nounwind optsize noinline ssp, !dbg !28
@@ -19,49 +19,49 @@ entry:
 
 declare void @foo(i32) nounwind optsize noinline ssp
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!2}
 !llvm.module.flags = !{!38}
 
-!0 = metadata !{i32 786484, i32 0, metadata !1, metadata !"ret", metadata !"ret", metadata !"", metadata !1, i32 7, metadata !3, i1 false, i1 true, null, null} ; [ DW_TAG_variable ]
-!1 = metadata !{i32 786473, metadata !36} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !36, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !37, metadata !37, metadata !32, metadata !31, metadata !37, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!4 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !1, i32 12, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
-!5 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 13, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, void (i32)* @foo, null, null, metadata !33, i32 13} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!7 = metadata !{null, metadata !3}
-!8 = metadata !{i32 786689, metadata !9, metadata !"myvar", metadata !1, i32 17, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"bar", metadata !"bar", metadata !"bar", i32 17, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i8* (%struct.a*)* @bar, null, null, metadata !34, i32 17} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
-!13 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786451, metadata !36, metadata !1, metadata !"a", i32 2, i64 128, i64 64, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ]
-!15 = metadata !{metadata !16, metadata !17}
-!16 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"c", i32 3, i64 32, i64 32, i64 0, i32 0, metadata !3} ; [ DW_TAG_member ]
-!17 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"d", i32 4, i64 64, i64 64, i64 64, i32 0, metadata !13} ; [ DW_TAG_member ]
-!18 = metadata !{i32 786689, metadata !19, metadata !"argc", metadata !1, i32 22, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
-!19 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 22, metadata !20, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, metadata !35, i32 22} ; [ DW_TAG_subprogram ]
-!20 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!21 = metadata !{metadata !3, metadata !3, metadata !22}
-!22 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ]
-!24 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!25 = metadata !{i32 786689, metadata !19, metadata !"argv", metadata !1, i32 22, metadata !22, i32 0, null} ; [ DW_TAG_arg_variable ]
-!26 = metadata !{i32 786688, metadata !27, metadata !"e", metadata !1, i32 23, metadata !14, i32 0, null} ; [ DW_TAG_auto_variable ]
-!27 = metadata !{i32 786443, metadata !36, metadata !19, i32 22, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!28 = metadata !{i32 18, i32 0, metadata !29, null}
-!29 = metadata !{i32 786443, metadata !36, metadata !9, i32 17, i32 0, i32 1} ; [ DW_TAG_lexical_block ]
-!30 = metadata !{i32 19, i32 0, metadata !29, null}
-!31 = metadata !{metadata !0}
-!32 = metadata !{metadata !5, metadata !9, metadata !19}
-!33 = metadata !{metadata !4}
-!34 = metadata !{metadata !8}
-!35 = metadata !{metadata !18, metadata !25, metadata !26}
-!36 = metadata !{metadata !"foo.c", metadata !"/tmp/"}
-!37 = metadata !{}
+!0 = !{!"0x34\00ret\00ret\00\007\000\001", !1, !1, !3, null, null} ; [ DW_TAG_variable ]
+!1 = !{!"0x29", !36} ; [ DW_TAG_file_type ]
+!2 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", !36, !37, !37, !32, !31, !37} ; [ DW_TAG_compile_unit ]
+!3 = !{!"0x24\00int\000\0032\0032\000\000\005", !36, !1} ; [ DW_TAG_base_type ]
+!4 = !{!"0x101\00x\0012\000", !5, !1, !3} ; [ DW_TAG_arg_variable ]
+!5 = !{!"0x2e\00foo\00foo\00foo\0013\000\001\000\006\000\001\0013", !36, !1, !6, null, void (i32)* @foo, null, null, !33} ; [ DW_TAG_subprogram ]
+!6 = !{!"0x15\00\000\000\000\000\000\000", !36, !1, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = !{null, !3}
+!8 = !{!"0x101\00myvar\0017\000", !9, !1, !13} ; [ DW_TAG_arg_variable ]
+!9 = !{!"0x2e\00bar\00bar\00bar\0017\000\001\000\006\000\001\0017", !36, !1, !10, null, i8* (%struct.a*)* @bar, null, null, !34} ; [ DW_TAG_subprogram ]
+!10 = !{!"0x15\00\000\000\000\000\000\000", !36, !1, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!11 = !{!12, !13}
+!12 = !{!"0xf\00\000\0064\0064\000\000", !36, !1, null} ; [ DW_TAG_pointer_type ]
+!13 = !{!"0xf\00\000\0064\0064\000\000", !36, !1, !14} ; [ DW_TAG_pointer_type ]
+!14 = !{!"0x13\00a\002\00128\0064\000\000\000", !36, !1, null, !15, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ]
+!15 = !{!16, !17}
+!16 = !{!"0xd\00c\003\0032\0032\000\000", !36, !14, !3} ; [ DW_TAG_member ]
+!17 = !{!"0xd\00d\004\0064\0064\0064\000", !36, !14, !13} ; [ DW_TAG_member ]
+!18 = !{!"0x101\00argc\0022\000", !19, !1, !3} ; [ DW_TAG_arg_variable ]
+!19 = !{!"0x2e\00main\00main\00main\0022\000\001\000\006\000\001\0022", !36, !1, !20, null, null, null, null, !35} ; [ DW_TAG_subprogram ]
+!20 = !{!"0x15\00\000\000\000\000\000\000", !36, !1, null, !21, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!21 = !{!3, !3, !22}
+!22 = !{!"0xf\00\000\0064\0064\000\000", !36, !1, !23} ; [ DW_TAG_pointer_type ]
+!23 = !{!"0xf\00\000\0064\0064\000\000", !36, !1, !24} ; [ DW_TAG_pointer_type ]
+!24 = !{!"0x24\00char\000\008\008\000\000\006", !36, !1} ; [ DW_TAG_base_type ]
+!25 = !{!"0x101\00argv\0022\000", !19, !1, !22} ; [ DW_TAG_arg_variable ]
+!26 = !{!"0x100\00e\0023\000", !27, !1, !14} ; [ DW_TAG_auto_variable ]
+!27 = !{!"0xb\0022\000\000", !36, !19} ; [ DW_TAG_lexical_block ]
+!28 = !MDLocation(line: 18, scope: !29)
+!29 = !{!"0xb\0017\000\001", !36, !9} ; [ DW_TAG_lexical_block ]
+!30 = !MDLocation(line: 19, scope: !29)
+!31 = !{!0}
+!32 = !{!5, !9, !19}
+!33 = !{!4}
+!34 = !{!8}
+!35 = !{!18, !25, !26}
+!36 = !{!"foo.c", !"/tmp/"}
+!37 = !{}
 
 ; The variable bar:myvar changes registers after the first movq.
 ; It is cobbered by popq %rbx
@@ -73,18 +73,22 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
 
 ; CHECK: Ldebug_loc0:
-; CHECK-NEXT: .quad Lfunc_begin0
-; CHECK-NEXT: .quad [[LABEL]]
+; CHECK-NEXT: [[SET1:.*]] = Lfunc_begin0-Lfunc_begin0
+; CHECK-NEXT: .quad [[SET1]]
+; CHECK-NEXT: [[SET2:.*]] = [[LABEL]]-Lfunc_begin0
+; CHECK-NEXT: .quad [[SET2]]
 ; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}} ## Loc expr size
 ; CHECK-NEXT: .short Lset{{.*}}
 ; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .byte 85
 ; CHECK-NEXT: Ltmp{{.*}}:
-; CHECK-NEXT: .quad [[LABEL]]
-; CHECK-NEXT: .quad [[CLOBBER]]
+; CHECK-NEXT: [[SET3:.*]] = [[LABEL]]-Lfunc_begin0
+; CHECK-NEXT: .quad [[SET3]]
+; CHECK-NEXT: [[SET4:.*]] = [[CLOBBER]]-Lfunc_begin0
+; CHECK-NEXT: .quad [[SET4]]
 ; CHECK-NEXT: Lset{{.*}} = Ltmp{{.*}}-Ltmp{{.*}} ## Loc expr size
 ; CHECK-NEXT: .short Lset{{.*}}
 ; CHECK-NEXT: Ltmp{{.*}}:
 ; CHECK-NEXT: .byte 83
 ; CHECK-NEXT: Ltmp{{.*}}:
 
-!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!38 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll
index 1114c8dc87bb..7adacf5e0176 100644
--- a/test/CodeGen/X86/2010-05-28-Crash.ll
+++ b/test/CodeGen/X86/2010-05-28-Crash.ll
@@ -4,19 +4,19 @@
 define i32 @foo(i32 %y) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %y}, i64 0, metadata !0)
+  tail call void @llvm.dbg.value(metadata i32 %y, i64 0, metadata !0, metadata !{!"0x102"})
   %0 = tail call i32 (...)* @zoo(i32 %y) nounwind, !dbg !9 ; <i32> [#uses=1]
   ret i32 %0, !dbg !9
 }
 
 declare i32 @zoo(...)
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 define i32 @bar(i32 %x) nounwind optsize ssp {
 entry:
-  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !7)
-  tail call void @llvm.dbg.value(metadata !11, i64 0, metadata !0) nounwind
+  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !7, metadata !{!"0x102"})
+  tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !0, metadata !{!"0x102"}) nounwind
   %0 = tail call i32 (...)* @zoo(i32 1) nounwind, !dbg !12 ; <i32> [#uses=1]
   %1 = add nsw i32 %0, %x, !dbg !13 ; <i32> [#uses=1]
   ret i32 %1, !dbg !13
@@ -25,28 +25,28 @@ entry:
 
 !llvm.dbg.cu = !{!3}
 !llvm.module.flags = !{!20}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"y", metadata !2, i32 2, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @foo, null, null, metadata !15, i32 2} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !18, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !19, metadata !19, metadata !17, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!5 = metadata !{metadata !6, metadata !6}
-!6 = metadata !{i32 786468, metadata !18, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !2, i32 6, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"bar", metadata !"bar", metadata !"bar", i32 6, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @bar, null, null, metadata !16, i32 6} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 3, i32 0, metadata !10, null}
-!10 = metadata !{i32 786443, metadata !18, metadata !1, i32 2, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 1}
-!12 = metadata !{i32 3, i32 0, metadata !10, metadata !13}
-!13 = metadata !{i32 7, i32 0, metadata !14, null}
-!14 = metadata !{i32 786443, metadata !18, metadata !8, i32 6, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!15 = metadata !{metadata !0}
-!16 = metadata !{metadata !7}
-!17 = metadata !{metadata !1, metadata !8}
-!18 = metadata !{metadata !"f.c", metadata !"/tmp"}
-!19 = metadata !{i32 0}
+!0 = !{!"0x101\00y\002\000", !1, !2, !6} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00foo\00foo\00foo\002\000\001\000\006\000\001\002", !18, !2, !4, null, i32 (i32)* @foo, null, null, !15} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x29", !18} ; [ DW_TAG_file_type ]
+!3 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build)\001\00\000\00\001", !18, !19, !19, !17, null, null} ; [ DW_TAG_compile_unit ]
+!4 = !{!"0x15\00\000\000\000\000\000\000", !18, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!5 = !{!6, !6}
+!6 = !{!"0x24\00int\000\0032\0032\000\000\005", !18, !2} ; [ DW_TAG_base_type ]
+!7 = !{!"0x101\00x\006\000", !8, !2, !6} ; [ DW_TAG_arg_variable ]
+!8 = !{!"0x2e\00bar\00bar\00bar\006\000\001\000\006\000\001\006", !18, !2, !4, null, i32 (i32)* @bar, null, null, !16} ; [ DW_TAG_subprogram ]
+!9 = !MDLocation(line: 3, scope: !10)
+!10 = !{!"0xb\002\000\000", !18, !1} ; [ DW_TAG_lexical_block ]
+!11 = !{i32 1}
+!12 = !MDLocation(line: 3, scope: !10, inlinedAt: !13)
+!13 = !MDLocation(line: 7, scope: !14)
+!14 = !{!"0xb\006\000\000", !18, !8} ; [ DW_TAG_lexical_block ]
+!15 = !{!0}
+!16 = !{!7}
+!17 = !{!1, !8}
+!18 = !{!"f.c", !"/tmp"}
+!19 = !{i32 0}
 
 ;CHECK: DEBUG_VALUE: bar:x <- E
 ;CHECK: Ltmp
 ;CHECK: DEBUG_VALUE: foo:y <- 1{{$}}
 
-!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!20 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
index b45ac226a650..3687b828c4a4 100644
--- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
+++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
@@ -10,51 +10,51 @@ target triple = "x86_64-apple-darwin10.2"
 define i32 @_ZN3foo3bazEi(%struct.foo* nocapture %this, i32 %x) nounwind readnone optsize noinline ssp align 2 {
 ;CHECK: DEBUG_VALUE: baz:this <- RDI{{$}}
 entry:
-  tail call void @llvm.dbg.value(metadata !{%struct.foo* %this}, i64 0, metadata !15)
-  tail call void @llvm.dbg.value(metadata !{i32 %x}, i64 0, metadata !16)
+  tail call void @llvm.dbg.value(metadata %struct.foo* %this, i64 0, metadata !15, metadata !{!"0x102"})
+  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !16, metadata !{!"0x102"})
   %0 = mul nsw i32 %x, 7, !dbg !29 ; <i32> [#uses=1]
   %1 = add nsw i32 %0, 1, !dbg !29 ; <i32> [#uses=1]
   ret i32 %1, !dbg !29
 }
 
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone
 
 !llvm.dbg.cu = !{!4}
 !llvm.module.flags = !{!34}
 !llvm.dbg.lv = !{!0, !14, !15, !16, !17, !24, !25, !28}
 
-!0 = metadata !{i32 786689, metadata !1, metadata !"this", metadata !3, i32 11, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEi", i32 11, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786451, metadata !31, metadata !3, metadata !"foo", i32 3, i64 32, i64 32, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ]
-!3 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ]
-!4 = metadata !{i32 786449, metadata !31, i32 4, metadata !"4.2.1 LLVM build", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !33, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{metadata !6, metadata !1, metadata !8}
-!6 = metadata !{i32 786445, metadata !31, metadata !2, metadata !"y", i32 8, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ]
-!7 = metadata !{i32 786468, metadata !31, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"baz", metadata !"baz", metadata !"_ZN3foo3bazEi", i32 15, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null, i32 15} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!10 = metadata !{metadata !7, metadata !11, metadata !7}
-!11 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !2} ; [ DW_TAG_pointer_type ]
-!12 = metadata !{i32 786470, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !13} ; [ DW_TAG_const_type ]
-!13 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !2} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !3, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 786689, metadata !8, metadata !"this", metadata !3, i32 15, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
-!16 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !3, i32 15, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!17 = metadata !{i32 786689, metadata !18, metadata !"argc", metadata !3, i32 19, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!18 = metadata !{i32 786478, metadata !31, metadata !3, metadata !"main", metadata !"main", metadata !"main", i32 19, metadata !19, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, null, null, null, null, i32 19} ; [ DW_TAG_subprogram ]
-!19 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!20 = metadata !{metadata !7, metadata !7, metadata !21}
-!21 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ]
-!22 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
-!23 = metadata !{i32 786468, metadata !31, metadata !3, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!24 = metadata !{i32 786689, metadata !18, metadata !"argv", metadata !3, i32 19, metadata !21, i32 0, null} ; [ DW_TAG_arg_variable ]
-!25 = metadata !{i32 786688, metadata !26, metadata !"a", metadata !3, i32 20, metadata !2, i32 0, null} ; [ DW_TAG_auto_variable ]
-!26 = metadata !{i32 786443, metadata !31, metadata !27, i32 19, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!27 = metadata !{i32 786443, metadata !31, metadata !18, i32 19, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!28 = metadata !{i32 786688, metadata !26, metadata !"b", metadata !3, i32 21, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 16, i32 0, metadata !30, null}
-!30 = metadata !{i32 786443, metadata !31, metadata !8, i32 15, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!31 = metadata !{metadata !"foo.cp", metadata !"/tmp/"}
-!32 = metadata !{i32 0}
-!33 = metadata !{metadata !1, metadata !8, metadata !18}
-!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!0 = !{!"0x101\00this\0011\000", !1, !3, !12} ; [ DW_TAG_arg_variable ]
+!1 = !{!"0x2e\00bar\00bar\00_ZN3foo3barEi\0011\000\001\000\006\000\001\0011", !31, !2, !9, null, i32 (%struct.foo*, i32)* null, null, null, null} ; [ DW_TAG_subprogram ]
+!2 = !{!"0x13\00foo\003\0032\0032\000\000\000", !31, !3, null, !5, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ]
+!3 = !{!"0x29", !31} ; [ DW_TAG_file_type ]
+!4 = !{!"0x11\004\004.2.1 LLVM build\001\00\000\00\000", !31, !32, !32, !33, null, null} ; [ DW_TAG_compile_unit ]
+!5 = !{!6, !1, !8}
+!6 = !{!"0xd\00y\008\0032\0032\000\000", !31, !2, !7} ; [ DW_TAG_member ]
+!7 = !{!"0x24\00int\000\0032\0032\000\000\005", !31, !3} ; [ DW_TAG_base_type ]
+!8 = !{!"0x2e\00baz\00baz\00_ZN3foo3bazEi\0015\000\001\000\006\000\001\0015", !31, !2, !9, null, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null} ; [ DW_TAG_subprogram ]
+!9 = !{!"0x15\00\000\000\000\000\000\000", !31, !3, null, !10, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!10 = !{!7, !11, !7}
+!11 = !{!"0xf\00\000\0064\0064\000\0064", !31, !3, !2} ; [ DW_TAG_pointer_type ]
+!12 = !{!"0x26\00\000\0064\0064\000\0064", !31, !3, !13} ; [ DW_TAG_const_type ]
+!13 = !{!"0xf\00\000\0064\0064\000\000", !31, !3, !2} ; [ DW_TAG_pointer_type ]
+!14 = !{!"0x101\00x\0011\000", !1, !3, !7} ; [ DW_TAG_arg_variable ]
+!15 = !{!"0x101\00this\0015\000", !8, !3, !12} ; [ DW_TAG_arg_variable ]
+!16 = !{!"0x101\00x\0015\000", !8, !3, !7} ; [ DW_TAG_arg_variable ]
+!17 = !{!"0x101\00argc\0019\000", !18, !3, !7} ; [ DW_TAG_arg_variable ]
+!18 = !{!"0x2e\00main\00main\00main\0019\000\001\000\006\000\001\0019", !31, !3, !19, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!19 = !{!"0x15\00\000\000\000\000\000\000", !31, !3, null, !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!20 = !{!7, !7, !21}
+!21 = !{!"0xf\00\000\0064\0064\000\000", !31, !3, !22} ; [ DW_TAG_pointer_type ]
+!22 = !{!"0xf\00\000\0064\0064\000\000", !31, !3, !23} ; [ DW_TAG_pointer_type ]
+!23 = !{!"0x24\00char\000\008\008\000\000\006", !31, !3} ; [ DW_TAG_base_type ]
+!24 = !{!"0x101\00argv\0019\000", !18, !3, !21} ; [ DW_TAG_arg_variable ]
+!25 = !{!"0x100\00a\0020\000", !26, !3, !2} ; [ DW_TAG_auto_variable ]
+!26 = !{!"0xb\0019\000\000", !31, !27} ; [ DW_TAG_lexical_block ]
+!27 = !{!"0xb\0019\000\000", !31, !18} ; [ DW_TAG_lexical_block ]
+!28 = !{!"0x100\00b\0021\000", !26, !3, !7} ; [ DW_TAG_auto_variable ]
+!29 = !MDLocation(line: 16, scope: !30)
+!30 = !{!"0xb\0015\000\000", !31, !8} ; [ DW_TAG_lexical_block ]
+!31 = !{!"foo.cp", !"/tmp/"}
+!32 = !{i32 0}
+!33 = !{!1, !8, !18}
+!34 = !{i32 1, !"Debug Info Version", i32 2}
diff --git a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
index 0f8855d1267e..74a7610e6597 100644
--- a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
+++ b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
@@ -26,4 +26,4 @@ entry:
   ret i8* %1
 }
 
 declare i32 @printf(i8*, ...)
 
-!0 = metadata !{i32 191}
+!0 = !{i32 191}
diff --git a/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll b/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll
index 0df9dc1cb769..3470a06a543b 100644
--- a/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll
+++ b/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll
@@ -16,4 +16,4 @@ entry:
 
 declare x86_stdcallcc void @RtlUnwind(...)
-!0 = metadata !{i32 215}
+!0 = !{i32 215}
diff --git a/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll b/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
index d7bc21f6393a..7cffdc545e02 100644
--- a/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
+++ b/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
@@ -19,4 +19,4 @@ entry:
  ret i32 %asmresult
}

-!0 = metadata !{i32 108}
+!0 = !{i32 108}
diff --git a/test/CodeGen/X86/2010-07-06-DbgCrash.ll b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
index b49aec3af87a..457c49852dca 100644
--- a/test/CodeGen/X86/2010-07-06-DbgCrash.ll
+++ b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
@@ -3,29 +3,29 @@
@.str = private constant [4 x i8] c"one\00", align 1 ; <[4 x i8]*> [#uses=1]
@.str1 = private constant [4 x i8] c"two\00", align 1 ; <[5 x i8]*> [#uses=1]
@C.9.2167 = internal constant [2 x i8*] [i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0)]
-!38 = metadata !{i32 524329, metadata !109} ; [ DW_TAG_file_type ]
-!39 = metadata !{i32 524305, metadata !109, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", i1 true, metadata !"", i32 0, metadata !108, metadata !108, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!46 = metadata !{i32 524303, metadata !109, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !47} ; [ DW_TAG_pointer_type ]
-!47 = metadata !{i32 524324, metadata !109, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!97 = metadata !{i32 524334, i32 0, metadata !39, metadata !"main", metadata !"main", metadata !"main", i32 73, metadata !98, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!98 = metadata !{i32 524309, metadata !109, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !99, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!99 = metadata !{metadata !100}
-!100 = metadata !{i32 524324, metadata !109, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!101 = metadata !{[2 x i8*]* @C.9.2167}
-!102 = metadata !{i32 524544, metadata !103, metadata !"find_strings", metadata !38, i32 75, metadata !104, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!103 = metadata !{i32 524299, null, metadata !97, i32 73, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!104 = metadata !{i32 524289, metadata !109, null, metadata !"", i32 0, i64 85312, i64 64, i64 0, i32 0, metadata !46, metadata !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ]
-!105 = metadata !{metadata !106}
-!106 = metadata !{i32 524321, i64 0, i64 1333} ; [ DW_TAG_subrange_type ]
-!107 = metadata !{i32 73, i32 0, metadata !103, null}
-!108 = metadata !{i32 0}
-!109 = metadata !{metadata !"pbmsrch.c", metadata !"/Users/grawp/LLVM/test-suite/MultiSource/Benchmarks/MiBench/office-stringsearch"}
+!38 = !{!"0x29", !109} ; [ DW_TAG_file_type ]
+!39 = !{!"0x11\001\004.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)\001\00\000\00\000", !109, !108, !108, null, null, null} ; [ DW_TAG_compile_unit ]
+!46 = !{!"0xf\00\000\0064\0064\000\000", !109, null, !47} ; [ DW_TAG_pointer_type ]
+!47 = !{!"0x24\00char\000\008\008\000\000\006", !109, null} ; [ DW_TAG_base_type ]
+!97 = !{!"0x2e\00main\00main\00main\0073\000\001\000\006\000\000\000", i32 0, !39, !98, null, null, null, null, null} ; [ DW_TAG_subprogram ]
+!98 = !{!"0x15\00\000\000\000\000\000\000", !109, null, null, !99, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!99 = !{!100}
+!100 = !{!"0x24\00int\000\0032\0032\000\000\005", !109, null} ; [ DW_TAG_base_type ]
+!101 = !{[2 x i8*]* @C.9.2167}
+!102 = !{!"0x100\00find_strings\0075\000", !103, !38, !104} ; [ DW_TAG_auto_variable ]
+!103 = !{!"0xb\0073\000\000", null, !97} ; [ DW_TAG_lexical_block ]
+!104 = !{!"0x1\00\000\0085312\0064\000\000", !109, null, !46, !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ]
+!105 = !{!106}
+!106 = !{!"0x21\000\001333"} ; [ DW_TAG_subrange_type ]
+!107 = !MDLocation(line: 73, scope: !103)
+!108 = !{i32 0}
+!109 = !{!"pbmsrch.c", !"/Users/grawp/LLVM/test-suite/MultiSource/Benchmarks/MiBench/office-stringsearch"}

define i32 @main() nounwind ssp {
bb.nph:
-  tail call void @llvm.dbg.declare(metadata !101, metadata !102), !dbg !107
+  tail call void @llvm.dbg.declare(metadata [2 x i8*]* @C.9.2167, metadata !102, metadata !{!"0x102"}), !dbg !107
  ret i32 0, !dbg !107
}

-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index 09e34ef6b7f5..e3decf0c889a 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll
@@ -6,8 +6,8 @@
define i32 @_Z3fooi4SVal(i32 %i, %struct.SVal* noalias %location) nounwind ssp {
entry:
  %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !23), !dbg !24
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %location}, i64 0, metadata !25), !dbg !24
+  call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata !23, metadata !{!"0x102"}), !dbg !24
+  call void @llvm.dbg.value(metadata %struct.SVal* %location, i64 0, metadata !25, metadata !{!"0x102"}), !dbg !24
  %0 = icmp ne i32 %i, 0, !dbg !27 ; <i1> [#uses=1]
  br i1 %0, label %bb, label %bb1, !dbg !27
@@ -34,7 +34,7 @@ return: ; preds = %bb2

define linkonce_odr void @_ZN4SValC1Ev(%struct.SVal* %this) nounwind ssp align 2 {
entry:
  %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
-  call void @llvm.dbg.value(metadata !{%struct.SVal* %this}, i64 0, metadata !31), !dbg !34
+  call void @llvm.dbg.value(metadata %struct.SVal* %this, i64 0, metadata !31, metadata !{!"0x102"}), !dbg !34
  %0 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 0, !dbg !34 ; <i8**> [#uses=1]
  store i8* null, i8** %0, align 8, !dbg !34
  %1 = getelementptr inbounds %struct.SVal* %this, i32 0, i32 1, !dbg !34 ; <i32*> [#uses=1]
@@ -45,14 +45,14 @@ return: ; preds = %entry
  ret void, !dbg !35
}

-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone

define i32 @main() nounwind ssp {
entry:
  %0 = alloca %struct.SVal ; <%struct.SVal*> [#uses=3]
  %v = alloca %struct.SVal ; <%struct.SVal*> [#uses=4]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] - call void @llvm.dbg.declare(metadata !{%struct.SVal* %v}, metadata !38), !dbg !41 + call void @llvm.dbg.declare(metadata %struct.SVal* %v, metadata !38, metadata !{!"0x102"}), !dbg !41 call void @_ZN4SValC1Ev(%struct.SVal* %v) nounwind, !dbg !41 %1 = getelementptr inbounds %struct.SVal* %v, i32 0, i32 1, !dbg !42 ; <i32*> [#uses=1] store i32 1, i32* %1, align 8, !dbg !42 @@ -65,65 +65,65 @@ entry: %7 = load i32* %6, align 8, !dbg !43 ; <i32> [#uses=1] store i32 %7, i32* %5, align 8, !dbg !43 %8 = call i32 @_Z3fooi4SVal(i32 2, %struct.SVal* noalias %0) nounwind, !dbg !43 ; <i32> [#uses=0] - call void @llvm.dbg.value(metadata !{i32 %8}, i64 0, metadata !44), !dbg !43 + call void @llvm.dbg.value(metadata i32 %8, i64 0, metadata !44, metadata !{!"0x102"}), !dbg !43 br label %return, !dbg !45 return: ; preds = %entry ret i32 0, !dbg !45 } -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!49} -!46 = metadata !{metadata !16, metadata !17, metadata !20} +!46 = !{!16, !17, !20} -!0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"", i32 11, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 11} ; [ DW_TAG_subprogram ] -!1 = metadata !{i32 786451, metadata !47, metadata !2, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ] -!2 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ] -!3 = metadata !{i32 786449, metadata !47, i32 4, metadata !"4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !48, metadata !48, metadata !46, null, null, metadata !""} ; [ DW_TAG_compile_unit ] -!4 = metadata !{metadata !5, metadata !7, metadata !0, metadata !9} -!5 = metadata !{i32 786445, metadata !47, metadata !1, metadata !"Data", i32 7, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ] -!6 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] -!7 = metadata !{i32 786445, metadata !47, metadata !1, metadata !"Kind", i32 8, i64 32, i64 32, i64 64, i32 0, metadata !8} ; [ DW_TAG_member ] -!8 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] -!9 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"~SVal", metadata !"~SVal", metadata !"", i32 12, metadata !10, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 12} ; [ DW_TAG_subprogram ] -!10 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!11 = metadata !{null, metadata !12, metadata !13} -!12 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ] -!13 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!14 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!15 = metadata !{null, metadata !12} -!16 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"_ZN4SValC1Ev", i32 11, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null, i32 11} ; [ DW_TAG_subprogram ] -!17 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3fooi4SVal", i32 16, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null, i32 16} ; [ DW_TAG_subprogram ] -!18 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!19 = metadata !{metadata !13, metadata !13, metadata !1} -!20 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"main", metadata !"main", metadata !"main", i32 23, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @main, null, null, null, i32 23} ; [ DW_TAG_subprogram ] -!21 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!22 = metadata !{metadata !13} -!23 = metadata !{i32 786689, metadata !17, metadata !"i", metadata !2, i32 16, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ] -!24 = metadata !{i32 16, i32 0, metadata !17, null} -!25 = metadata !{i32 786689, metadata !17, metadata !"location", metadata !2, i32 16, metadata !26, i32 0, null} ; [ 
DW_TAG_arg_variable ] -!26 = metadata !{i32 786448, metadata !47, metadata !2, metadata !"SVal", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_reference_type ] -!27 = metadata !{i32 17, i32 0, metadata !28, null} -!28 = metadata !{i32 786443, metadata !47, metadata !17, i32 16, i32 0, i32 2} ; [ DW_TAG_lexical_block ] -!29 = metadata !{i32 18, i32 0, metadata !28, null} -!30 = metadata !{i32 20, i32 0, metadata !28, null} -!31 = metadata !{i32 786689, metadata !16, metadata !"this", metadata !2, i32 11, metadata !32, i32 0, null} ; [ DW_TAG_arg_variable ] -!32 = metadata !{i32 786470, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !33} ; [ DW_TAG_const_type ] -!33 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !1} ; [ DW_TAG_pointer_type ] -!34 = metadata !{i32 11, i32 0, metadata !16, null} -!35 = metadata !{i32 11, i32 0, metadata !36, null} -!36 = metadata !{i32 786443, metadata !47, metadata !37, i32 11, i32 0, i32 1} ; [ DW_TAG_lexical_block ] -!37 = metadata !{i32 786443, metadata !47, metadata !16, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ] -!38 = metadata !{i32 786688, metadata !39, metadata !"v", metadata !2, i32 24, metadata !1, i32 0, null} ; [ DW_TAG_auto_variable ] -!39 = metadata !{i32 786443, metadata !47, metadata !40, i32 23, i32 0, i32 4} ; [ DW_TAG_lexical_block ] -!40 = metadata !{i32 786443, metadata !47, metadata !20, i32 23, i32 0, i32 3} ; [ DW_TAG_lexical_block ] -!41 = metadata !{i32 24, i32 0, metadata !39, null} -!42 = metadata !{i32 25, i32 0, metadata !39, null} -!43 = metadata !{i32 26, i32 0, metadata !39, null} -!44 = metadata !{i32 786688, metadata !39, metadata !"k", metadata !2, i32 26, metadata !13, i32 0, null} ; [ DW_TAG_auto_variable ] -!45 = metadata !{i32 27, i32 0, metadata !39, null} -!47 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"} -!48 = metadata !{i32 0} -!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!0 = !{!"0x2e\00SVal\00SVal\00\0011\000\000\000\006\000\000\0011", !47, !1, !14, null, null, null, null, null} ; [ DW_TAG_subprogram ] +!1 = !{!"0x13\00SVal\001\00128\0064\000\000\000", !47, !2, null, !4, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ] +!2 = !{!"0x29", !47} ; [ DW_TAG_file_type ] +!3 = !{!"0x11\004\004.2.1 (Based on Apple Inc. 
build 5658) (LLVM build)\000\00\000\00\001", !47, !48, !48, !46, null, null} ; [ DW_TAG_compile_unit ] +!4 = !{!5, !7, !0, !9} +!5 = !{!"0xd\00Data\007\0064\0064\000\000", !47, !1, !6} ; [ DW_TAG_member ] +!6 = !{!"0xf\00\000\0064\0064\000\000", !47, !2, null} ; [ DW_TAG_pointer_type ] +!7 = !{!"0xd\00Kind\008\0032\0032\0064\000", !47, !1, !8} ; [ DW_TAG_member ] +!8 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", !47, !2} ; [ DW_TAG_base_type ] +!9 = !{!"0x2e\00~SVal\00~SVal\00\0012\000\000\000\006\000\000\0012", !47, !1, !10, null, null, null, null, null} ; [ DW_TAG_subprogram ] +!10 = !{!"0x15\00\000\000\000\000\000\000", !47, !2, null, !11, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!11 = !{null, !12, !13} +!12 = !{!"0xf\00\000\0064\0064\000\0064", !47, !2, !1} ; [ DW_TAG_pointer_type ] +!13 = !{!"0x24\00int\000\0032\0032\000\000\005", !47, !2} ; [ DW_TAG_base_type ] +!14 = !{!"0x15\00\000\000\000\000\000\000", !47, !2, null, !15, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!15 = !{null, !12} +!16 = !{!"0x2e\00SVal\00SVal\00_ZN4SValC1Ev\0011\000\001\000\006\000\000\0011", !47, !1, !14, null, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null} ; [ DW_TAG_subprogram ] +!17 = !{!"0x2e\00foo\00foo\00_Z3fooi4SVal\0016\000\001\000\006\000\000\0016", !47, !2, !18, null, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null} ; [ DW_TAG_subprogram ] +!18 = !{!"0x15\00\000\000\000\000\000\000", !47, !2, null, !19, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!19 = !{!13, !13, !1} +!20 = !{!"0x2e\00main\00main\00main\0023\000\001\000\006\000\000\0023", !47, !2, !21, null, i32 ()* @main, null, null, null} ; [ DW_TAG_subprogram ] +!21 = !{!"0x15\00\000\000\000\000\000\000", !47, !2, null, !22, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!22 = !{!13} +!23 = !{!"0x101\00i\0016\000", !17, !2, !13} ; [ DW_TAG_arg_variable ] +!24 = !MDLocation(line: 16, scope: !17) +!25 = !{!"0x101\00location\0016\000", !17, !2, !26} ; [ DW_TAG_arg_variable ] +!26 = !{!"0x10\00SVal\000\0064\0064\000\000", !47, !2, !1} ; [ DW_TAG_reference_type ] +!27 = !MDLocation(line: 17, scope: !28) +!28 = !{!"0xb\0016\000\002", !47, !17} ; [ DW_TAG_lexical_block ] +!29 = !MDLocation(line: 18, scope: !28) +!30 = !MDLocation(line: 20, scope: !28) +!31 = !{!"0x101\00this\0011\000", !16, !2, !32} ; [ DW_TAG_arg_variable ] +!32 = !{!"0x26\00\000\0064\0064\000\0064", !47, !2, !33} ; [ DW_TAG_const_type ] +!33 = !{!"0xf\00\000\0064\0064\000\000", !47, !2, !1} ; [ DW_TAG_pointer_type ] +!34 = !MDLocation(line: 11, scope: !16) +!35 = !MDLocation(line: 11, scope: !36) +!36 = !{!"0xb\0011\000\001", !47, !37} ; [ DW_TAG_lexical_block ] +!37 = !{!"0xb\0011\000\000", !47, !16} ; [ DW_TAG_lexical_block ] +!38 = !{!"0x100\00v\0024\000", !39, !2, !1} ; [ DW_TAG_auto_variable ] +!39 = !{!"0xb\0023\000\004", !47, !40} ; [ DW_TAG_lexical_block ] +!40 = !{!"0xb\0023\000\003", !47, !20} ; [ DW_TAG_lexical_block ] +!41 = !MDLocation(line: 24, scope: !39) +!42 = !MDLocation(line: 25, scope: !39) +!43 = !MDLocation(line: 26, scope: !39) +!44 = !{!"0x100\00k\0026\000", !39, !2, !13} ; [ DW_TAG_auto_variable ] +!45 = !MDLocation(line: 27, scope: !39) +!47 = !{!"small.cc", !"/Users/manav/R8248330"} +!48 = !{i32 0} +!49 = !{i32 1, !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll 
b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll index a65b632691ae..cf9897ac03ad 100644 --- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll +++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll @@ -15,21 +15,21 @@ entry: !llvm.dbg.cu = !{!2} !llvm.module.flags = !{!17} -!0 = metadata !{i32 786478, metadata !14, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 53, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ] -!1 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ] -!2 = metadata !{i32 786449, metadata !15, i32 12, metadata !"clang version 2.9 (trunk 114084)", i1 false, metadata !"", i32 0, metadata !16, metadata !16, metadata !13, null, null, metadata !""} ; [ DW_TAG_compile_unit ] -!3 = metadata !{i32 786453, metadata !14, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!4 = metadata !{metadata !5} -!5 = metadata !{i32 786468, metadata !14, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!6 = metadata !{i32 786478, metadata !15, metadata !7, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ] -!7 = metadata !{i32 786473, metadata !15} ; [ DW_TAG_file_type ] -!8 = metadata !{i32 53, i32 13, metadata !9, null} -!9 = metadata !{i32 786443, metadata !14, metadata !0, i32 53, i32 11, i32 0} ; [ DW_TAG_lexical_block ] -!10 = metadata !{i32 4, i32 13, metadata !11, null} -!11 = metadata !{i32 786443, metadata !15, metadata !12, i32 4, i32 13, i32 2} ; [ DW_TAG_lexical_block ] -!12 = metadata !{i32 786443, metadata !15, metadata !6, i32 4, i32 11, i32 1} ; [ DW_TAG_lexical_block ] -!13 = metadata !{metadata !0, metadata !6} -!14 = metadata !{metadata !"", metadata !"/private/tmp"} -!15 = metadata !{metadata !"bug.c", metadata !"/private/tmp"} -!16 = metadata !{i32 0} -!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!0 = !{!"0x2e\00foo\00foo\00foo\0053\000\001\000\006\000\000\000", !14, !1, !3, null, i32 ()* @foo, null, null, null} ; [ DW_TAG_subprogram ] +!1 = !{!"0x29", !14} ; [ DW_TAG_file_type ] +!2 = !{!"0x11\0012\00clang version 2.9 (trunk 114084)\000\00\000\00\000", !15, !16, !16, !13, null, null} ; [ DW_TAG_compile_unit ] +!3 = !{!"0x15\00\000\000\000\000\000\000", !14, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!4 = !{!5} +!5 = !{!"0x24\00int\000\0032\0032\000\000\005", !14, !1} ; [ DW_TAG_base_type ] +!6 = !{!"0x2e\00bar\00bar\00bar\004\000\001\000\006\000\000\000", !15, !7, !3, null, i32 ()* @bar, null, null, null} ; [ DW_TAG_subprogram ] +!7 = !{!"0x29", !15} ; [ DW_TAG_file_type ] +!8 = !MDLocation(line: 53, column: 13, scope: !9) +!9 = !{!"0xb\0053\0011\000", !14, !0} ; [ DW_TAG_lexical_block ] +!10 = !MDLocation(line: 4, column: 13, scope: !11) +!11 = !{!"0xb\004\0013\002", !15, !12} ; [ DW_TAG_lexical_block ] +!12 = !{!"0xb\004\0011\001", !15, !6} ; [ DW_TAG_lexical_block ] +!13 = !{!0, !6} +!14 = !{!"", !"/private/tmp"} +!15 = !{!"bug.c", !"/private/tmp"} +!16 = !{i32 0} +!17 = !{i32 1, !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/2010-09-16-asmcrash.ll b/test/CodeGen/X86/2010-09-16-asmcrash.ll index 9bbd6919421f..7aa9f32d41c4 100644 --- 
a/test/CodeGen/X86/2010-09-16-asmcrash.ll +++ b/test/CodeGen/X86/2010-09-16-asmcrash.ll @@ -53,4 +53,4 @@ return: ; preds = %while.end, %while.b ret void } -!0 = metadata !{i32 158484} +!0 = !{i32 158484} diff --git a/test/CodeGen/X86/2010-11-02-DbgParameter.ll b/test/CodeGen/X86/2010-11-02-DbgParameter.ll index 21ac7c9079e8..df3aa1f2ab37 100644 --- a/test/CodeGen/X86/2010-11-02-DbgParameter.ll +++ b/test/CodeGen/X86/2010-11-02-DbgParameter.ll @@ -9,32 +9,32 @@ target triple = "i386-apple-darwin11.0.0" define i32 @foo(%struct.bar* nocapture %i) nounwind readnone optsize noinline ssp { ; CHECK: TAG_formal_parameter entry: - tail call void @llvm.dbg.value(metadata !{%struct.bar* %i}, i64 0, metadata !6), !dbg !12 + tail call void @llvm.dbg.value(metadata %struct.bar* %i, i64 0, metadata !6, metadata !{!"0x102"}), !dbg !12 ret i32 1, !dbg !13 } -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!2} !llvm.module.flags = !{!19} -!0 = metadata !{i32 786478, metadata !17, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (%struct.bar*)* @foo, null, null, metadata !16, i32 3} ; [ DW_TAG_subprogram ] -!1 = metadata !{i32 786473, metadata !17} ; [ DW_TAG_file_type ] -!2 = metadata !{i32 786449, metadata !17, i32 12, metadata !"clang version 2.9 (trunk 117922)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, metadata !15, null, null, metadata !""} ; [ DW_TAG_compile_unit ] -!3 = metadata !{i32 786453, metadata !17, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!4 = metadata !{metadata !5} -!5 = metadata !{i32 786468, metadata !17, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!6 = metadata !{i32 786689, metadata !0, metadata !"i", metadata !1, i32 3, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ] -!7 = metadata !{i32 786447, metadata !17, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ] -!8 = metadata !{i32 786451, metadata !17, metadata !1, metadata !"bar", i32 2, i64 64, i64 32, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ] -!9 = metadata !{metadata !10, metadata !11} -!10 = metadata !{i32 786445, metadata !17, metadata !1, metadata !"x", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ] -!11 = metadata !{i32 786445, metadata !17, metadata !1, metadata !"y", i32 2, i64 32, i64 32, i64 32, i32 0, metadata !5} ; [ DW_TAG_member ] -!12 = metadata !{i32 3, i32 47, metadata !0, null} -!13 = metadata !{i32 4, i32 2, metadata !14, null} -!14 = metadata !{i32 786443, metadata !17, metadata !0, i32 3, i32 50, i32 0} ; [ DW_TAG_lexical_block ] -!15 = metadata !{metadata !0} -!16 = metadata !{metadata !6} -!17 = metadata !{metadata !"one.c", metadata !"/private/tmp"} -!18 = metadata !{i32 0} -!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!0 = !{!"0x2e\00foo\00foo\00\003\000\001\000\006\00256\001\003", !17, !1, !3, null, i32 (%struct.bar*)* @foo, null, null, !16} ; [ DW_TAG_subprogram ] +!1 = !{!"0x29", !17} ; [ DW_TAG_file_type ] +!2 = !{!"0x11\0012\00clang version 2.9 (trunk 117922)\001\00\000\00\000", !17, !18, 
!18, !15, null, null} ; [ DW_TAG_compile_unit ] +!3 = !{!"0x15\00\000\000\000\000\000\000", !17, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!4 = !{!5} +!5 = !{!"0x24\00int\000\0032\0032\000\000\005", !17, !2} ; [ DW_TAG_base_type ] +!6 = !{!"0x101\00i\003\000", !0, !1, !7} ; [ DW_TAG_arg_variable ] +!7 = !{!"0xf\00\000\0032\0032\000\000", !17, !1, !8} ; [ DW_TAG_pointer_type ] +!8 = !{!"0x13\00bar\002\0064\0032\000\000\000", !17, !1, null, !9, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ] +!9 = !{!10, !11} +!10 = !{!"0xd\00x\002\0032\0032\000\000", !17, !1, !5} ; [ DW_TAG_member ] +!11 = !{!"0xd\00y\002\0032\0032\0032\000", !17, !1, !5} ; [ DW_TAG_member ] +!12 = !MDLocation(line: 3, column: 47, scope: !0) +!13 = !MDLocation(line: 4, column: 2, scope: !14) +!14 = !{!"0xb\003\0050\000", !17, !0} ; [ DW_TAG_lexical_block ] +!15 = !{!0} +!16 = !{!6} +!17 = !{!"one.c", !"/private/tmp"} +!18 = !{i32 0} +!19 = !{i32 1, !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll index 625a35161c11..8404020c91f1 100644 --- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll +++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll @@ -22,8 +22,8 @@ target triple = "x86_64-apple-darwin10.0.0" define i64 @gcd(i64 %a, i64 %b) nounwind readnone optsize noinline ssp { entry: - tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !10), !dbg !18 - tail call void @llvm.dbg.value(metadata !{i64 %b}, i64 0, metadata !11), !dbg !19 + tail call void @llvm.dbg.value(metadata i64 %a, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !18 + tail call void @llvm.dbg.value(metadata i64 %b, i64 0, metadata !11, metadata !{!"0x102"}), !dbg !19 br label %while.body, !dbg !20 while.body: ; preds = %while.body, %entry @@ -34,14 +34,14 @@ while.body: ; preds = %while.body, %entry br i1 %cmp, label %if.then, label %while.body, !dbg !23 if.then: ; preds = %while.body - tail call void @llvm.dbg.value(metadata !{i64 %rem}, i64 0, metadata !12), !dbg !21 + tail call void @llvm.dbg.value(metadata i64 %rem, i64 0, metadata !12, metadata !{!"0x102"}), !dbg !21 ret i64 %b.addr.0, !dbg !23 } define i32 @main() nounwind optsize ssp { entry: %call = tail call i32 @rand() nounwind optsize, !dbg !24 - tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !14), !dbg !24 + tail call void @llvm.dbg.value(metadata i32 %call, i64 0, metadata !14, metadata !{!"0x102"}), !dbg !24 %cmp = icmp ugt i32 %call, 21, !dbg !25 br i1 %cmp, label %cond.true, label %cond.end, !dbg !25 @@ -51,7 +51,7 @@ cond.true: ; preds = %entry cond.end: ; preds = %entry, %cond.true %cond = phi i32 [ %call1, %cond.true ], [ %call, %entry ], !dbg !25 - tail call void @llvm.dbg.value(metadata !{i32 %cond}, i64 0, metadata !17), !dbg !25 + tail call void @llvm.dbg.value(metadata i32 %cond, i64 0, metadata !17, metadata !{!"0x102"}), !dbg !25 %conv = sext i32 %cond to i64, !dbg !26 %conv5 = zext i32 %call to i64, !dbg !26 %call6 = tail call i64 @gcd(i64 %conv, i64 %conv5) optsize, !dbg !26 @@ -71,44 +71,44 @@ declare i32 @rand() optsize declare i32 @printf(i8* nocapture, ...) 
nounwind optsize -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone declare i32 @puts(i8* nocapture) nounwind !llvm.dbg.cu = !{!2} !llvm.module.flags = !{!33} -!0 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"gcd", metadata !"gcd", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i64 (i64, i64)* @gcd, null, null, metadata !29, i32 0} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd] -!1 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ] -!2 = metadata !{i32 786449, metadata !31, i32 12, metadata !"clang version 2.9 (trunk 124117)", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !28, null, null, null} ; [ DW_TAG_compile_unit ] -!3 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!4 = metadata !{metadata !5} -!5 = metadata !{i32 786468, null, metadata !2, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!6 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 25, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @main, null, null, metadata !30, i32 0} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main] -!7 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!8 = metadata !{metadata !9} -!9 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!10 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ] -!11 = metadata !{i32 786689, metadata !0, metadata !"b", metadata !1, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ] -!12 = metadata !{i32 786688, metadata !13, metadata !"c", metadata !1, i32 6, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ] -!13 = metadata !{i32 786443, metadata !31, metadata !0, i32 5, i32 52, i32 0} ; [ DW_TAG_lexical_block ] -!14 = metadata !{i32 786688, metadata !15, metadata !"m", metadata !1, i32 26, metadata !16, i32 0, null} ; [ DW_TAG_auto_variable ] -!15 = metadata !{i32 786443, metadata !31, metadata !6, i32 25, i32 12, i32 2} ; [ DW_TAG_lexical_block ] -!16 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] -!17 = metadata !{i32 786688, metadata !15, metadata !"z_s", metadata !1, i32 27, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ] -!18 = metadata !{i32 5, i32 41, metadata !0, null} -!19 = metadata !{i32 5, i32 49, metadata !0, null} -!20 = metadata !{i32 7, i32 5, metadata !13, null} -!21 = metadata !{i32 8, i32 9, metadata !22, null} -!22 = metadata !{i32 786443, metadata !31, metadata !13, i32 7, i32 14, i32 1} ; [ DW_TAG_lexical_block ] -!23 = metadata !{i32 9, i32 9, metadata !22, null} -!24 = metadata !{i32 26, i32 38, metadata !15, null} -!25 = metadata !{i32 27, i32 38, metadata !15, null} -!26 = metadata !{i32 28, i32 9, metadata !15, null} -!27 = metadata !{i32 30, i32 1, metadata !15, null} -!28 = metadata !{metadata !0, metadata !6} -!29 = metadata 
!{metadata !10, metadata !11, metadata !12} -!30 = metadata !{metadata !14, metadata !17} -!31 = metadata !{metadata !"rem_small.c", metadata !"/private/tmp"} -!32 = metadata !{i32 0} -!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!0 = !{!"0x2e\00gcd\00gcd\00\005\000\001\000\006\00256\001\000", !31, !1, !3, null, i64 (i64, i64)* @gcd, null, null, !29} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd] +!1 = !{!"0x29", !31} ; [ DW_TAG_file_type ] +!2 = !{!"0x11\0012\00clang version 2.9 (trunk 124117)\001\00\000\00\001", !31, !32, !32, !28, null, null} ; [ DW_TAG_compile_unit ] +!3 = !{!"0x15\00\000\000\000\000\000\000", !31, !1, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!4 = !{!5} +!5 = !{!"0x24\00long int\000\0064\0064\000\000\005", null, !2} ; [ DW_TAG_base_type ] +!6 = !{!"0x2e\00main\00main\00\0025\000\001\000\006\000\001\000", !31, !1, !7, null, i32 ()* @main, null, null, !30} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main] +!7 = !{!"0x15\00\000\000\000\000\000\000", !31, !1, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = !{!9} +!9 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !2} ; [ DW_TAG_base_type ] +!10 = !{!"0x101\00a\005\000", !0, !1, !5} ; [ DW_TAG_arg_variable ] +!11 = !{!"0x101\00b\005\000", !0, !1, !5} ; [ DW_TAG_arg_variable ] +!12 = !{!"0x100\00c\006\000", !13, !1, !5} ; [ DW_TAG_auto_variable ] +!13 = !{!"0xb\005\0052\000", !31, !0} ; [ DW_TAG_lexical_block ] +!14 = !{!"0x100\00m\0026\000", !15, !1, !16} ; [ DW_TAG_auto_variable ] +!15 = !{!"0xb\0025\0012\002", !31, !6} ; [ DW_TAG_lexical_block ] +!16 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, !2} ; [ DW_TAG_base_type ] +!17 = !{!"0x100\00z_s\0027\000", !15, !1, !9} ; [ DW_TAG_auto_variable ] +!18 = !MDLocation(line: 5, column: 41, scope: !0) +!19 = !MDLocation(line: 5, column: 49, scope: !0) +!20 = !MDLocation(line: 7, column: 5, scope: !13) +!21 = !MDLocation(line: 8, column: 9, scope: !22) +!22 = !{!"0xb\007\0014\001", !31, !13} ; [ DW_TAG_lexical_block ] +!23 = !MDLocation(line: 9, column: 9, scope: !22) +!24 = !MDLocation(line: 26, column: 38, scope: !15) +!25 = !MDLocation(line: 27, column: 38, scope: !15) +!26 = !MDLocation(line: 28, column: 9, scope: !15) +!27 = !MDLocation(line: 30, column: 1, scope: !15) +!28 = !{!0, !6} +!29 = !{!10, !11, !12} +!30 = !{!14, !17} +!31 = !{!"rem_small.c", !"/private/tmp"} +!32 = !{i32 0} +!33 = !{i32 1, !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll b/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll index 445fc01231e4..b764da1ab585 100644 --- a/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll +++ b/test/CodeGen/X86/2011-06-14-mmx-inlineasm.ll @@ -41,5 +41,5 @@ entry: declare void @llvm.x86.mmx.emms() nounwind -!0 = metadata !{i32 888, i32 917, i32 945, i32 973, i32 1001, i32 1029, i32 1057} -!1 = metadata !{i32 1390, i32 1430, i32 1469, i32 1508, i32 1547, i32 1586, i32 1625, i32 1664} +!0 = !{i32 888, i32 917, i32 945, i32 973, i32 1001, i32 1029, i32 1057} +!1 = !{i32 1390, i32 1430, i32 1469, i32 1508, i32 1547, i32 1586, i32 1625, i32 1664} diff --git a/test/CodeGen/X86/2011-08-29-InitOrder.ll b/test/CodeGen/X86/2011-08-29-InitOrder.ll index a95dcb580702..b278ad674152 100644 --- a/test/CodeGen/X86/2011-08-29-InitOrder.ll +++ b/test/CodeGen/X86/2011-08-29-InitOrder.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck %s --check-prefix=CHECK-DEFAULT +; RUN: 
llc < %s -mtriple=i386-linux-gnu -use-ctors | FileCheck %s --check-prefix=CHECK-DEFAULT ; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s --check-prefix=CHECK-DARWIN ; PR5329 diff --git a/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll b/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll index cd8a16f5732a..b78c13f9d4e6 100644 --- a/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll +++ b/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i686-linux -mattr=-sse | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-linux -mattr=-sse | FileCheck %s ; PR11768 @ptr = external global i8* diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll index 16706ae957f2..6651af705551 100644 --- a/test/CodeGen/X86/2012-04-26-sdglue.ll +++ b/test/CodeGen/X86/2012-04-26-sdglue.ll @@ -8,7 +8,7 @@ ;CHECK: vpxor ;CHECK: vinserti128 ;CHECK: vpshufd -;CHECK: vpshufd +;CHECK: vpbroadcastd ;CHECK: vmulps ;CHECK: vmulps ;CHECK: ret diff --git a/test/CodeGen/X86/2012-05-19-avx2-store.ll b/test/CodeGen/X86/2012-05-19-avx2-store.ll deleted file mode 100644 index 1c1e8e2f0a21..000000000000 --- a/test/CodeGen/X86/2012-05-19-avx2-store.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s - -define void @double_save(<4 x i32>* %Ap, <4 x i32>* %Bp, <8 x i32>* %P) nounwind ssp { -entry: - ; CHECK: vmovaps - ; CHECK: vinsertf128 $1, ([[A0:%rdi|%rsi]]), - ; CHECK: vmovups - %A = load <4 x i32>* %Ap - %B = load <4 x i32>* %Bp - %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - store <8 x i32> %Z, <8 x i32>* %P, align 16 - ret void -} diff --git a/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/test/CodeGen/X86/2012-07-15-broadcastfold.ll index 1c39c747cdc8..519c7cac736f 100644 --- a/test/CodeGen/X86/2012-07-15-broadcastfold.ll +++ b/test/CodeGen/X86/2012-07-15-broadcastfold.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 | FileCheck %s +; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s declare x86_fastcallcc i64 @barrier() diff --git a/test/CodeGen/X86/2012-10-02-DAGCycle.ll b/test/CodeGen/X86/2012-10-02-DAGCycle.ll index 8d914db3315f..403d21ae9733 100644 --- a/test/CodeGen/X86/2012-10-02-DAGCycle.ll +++ b/test/CodeGen/X86/2012-10-02-DAGCycle.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s -; RUN: llc -mtriple=x86_64-apple-macosx -relocation-model=pic < %s +; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s > /dev/null +; RUN: llc -mtriple=x86_64-apple-macosx -relocation-model=pic < %s > /dev/null ; rdar://12393897 diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll index 62ee1e15fda0..c33b48dfecb5 100644 --- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll +++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll @@ -12,11 +12,11 @@ %struct.hgstruct.2.29 = type { %struct.bnode.1.28*, [3 x double], double, [3 x double] } %struct.bnode.1.28 = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode.1.28*, %struct.bnode.1.28* } -declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, 
%struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp { entry: - call void @llvm.dbg.declare(metadata !{%struct.hgstruct.2.29* %hg}, metadata !4) + call void @llvm.dbg.declare(metadata %struct.hgstruct.2.29* %hg, metadata !4, metadata !{!"0x102"}) %type = getelementptr inbounds %struct.node.0.27* %p, i64 0, i32 0 %0 = load i16* %type, align 2 %cmp = icmp eq i16 %0, 1 @@ -33,16 +33,20 @@ return: ; preds = %for.cond.preheader, ret i16 %retval.0 } -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!12} -!0 = metadata !{i32 786449, metadata !11, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99] -!2 = metadata !{} -!4 = metadata !{i32 786689, null, metadata !"hg", metadata !5, i32 67109589, metadata !6, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [hg] [line 725] -!5 = metadata !{i32 786473, metadata !11} ; [ DW_TAG_file_type ] -!6 = metadata !{i32 786454, metadata !11, null, metadata !"hgstruct", i32 492, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ] -!7 = metadata !{i32 786451, metadata !11, null, metadata !"", i32 487, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ] -!11 = metadata !{metadata !"MultiSource/Benchmarks/Olden/bh/newbh.c", metadata !"MultiSource/Benchmarks/Olden/bh"} -!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!0 = !{!"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", !11, !2, !2, !13, !2, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99] +!2 = !{} +!4 = !{!"0x101\00hg\0067109589\000", null, !5, !6} ; [ DW_TAG_arg_variable ] [hg] [line 725] +!5 = !{!"0x29", !11} ; [ DW_TAG_file_type ] +!6 = !{!"0x16\00hgstruct\00492\000\000\000\000", !11, null, !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ] +!7 = !{!"0x13\00\00487\00512\0064\000\000\000", !11, null, null, null, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ] +!11 = !{!"MultiSource/Benchmarks/Olden/bh/newbh.c", !"MultiSource/Benchmarks/Olden/bh"} +!12 = !{i32 1, !"Debug Info Version", i32 2} +!13 = !{!14} +!14 = !{!"0x2e\00subdivp\00subdivp\00\000\000\001\000\006\00256\001\001", !11, !5, !15, null, i16 (%struct.node.0.27*, double, double, %struct.hgstruct.2.29* )* @subdivp, null, null, null} ; [ DW_TAG_subprogram ] [def] [subdivp] +!15 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!16 = !{null} diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll index 36667def6110..28ceb2fad2fc 100644 --- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll +++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll @@ -12,7 +12,7 @@ @.str15 = external hidden unnamed_addr constant [6 x i8], align 1 -declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define i32 @AttachGalley(%union.rec** 
nocapture %suspend_pt) nounwind uwtable ssp { entry: @@ -43,7 +43,7 @@ if.then3344: br label %if.then4073 if.then4073: ; preds = %if.then3344 - call void @llvm.dbg.declare(metadata !{[20 x i8]* %num14075}, metadata !4) + call void @llvm.dbg.declare(metadata [20 x i8]* %num14075, metadata !4, metadata !{!"0x102"}) %arraydecay4078 = getelementptr inbounds [20 x i8]* %num14075, i64 0, i64 0 %0 = load i32* undef, align 4 %add4093 = add nsw i32 %0, 0 @@ -65,25 +65,30 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!35} -!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99] -!1 = metadata !{metadata !2} -!2 = metadata !{} -!4 = metadata !{i32 786688, metadata !5, metadata !"num1", metadata !14, i32 815, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [num1] [line 815] -!5 = metadata !{i32 786443, metadata !14, metadata !6, i32 815, i32 0, i32 177} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!6 = metadata !{i32 786443, metadata !14, metadata !7, i32 812, i32 0, i32 176} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!7 = metadata !{i32 786443, metadata !14, metadata !8, i32 807, i32 0, i32 175} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!8 = metadata !{i32 786443, metadata !14, metadata !9, i32 440, i32 0, i32 94} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!9 = metadata !{i32 786443, metadata !14, metadata !10, i32 435, i32 0, i32 91} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!10 = metadata !{i32 786443, metadata !14, metadata !11, i32 434, i32 0, i32 90} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!11 = metadata !{i32 786443, metadata !14, metadata !12, i32 250, i32 0, i32 24} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!12 = metadata !{i32 786443, metadata !14, metadata !13, i32 249, i32 0, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!13 = metadata !{i32 786443, metadata !14, metadata !2, i32 221, i32 0, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] -!14 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ] -!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char] -!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char] -!17 = metadata !{metadata !18} -!18 = metadata !{i32 786465, i64 0, i64 20} ; [ DW_TAG_subrange_type ] [0, 19] -!19 = metadata !{metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", metadata !"MultiSource/Benchmarks/MiBench/consumer-typeset"} +!0 = !{!"0x11\0012\00clang version 3.3 (trunk 168918) (llvm/trunk 168920)\001\00\000\00\000", !19, !2, !2, !20, !2, null} ; [ DW_TAG_compile_unit ] 
[MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99] +!1 = !{!2} +!2 = !{} +!4 = !{!"0x100\00num1\00815\000", !5, !14, !15} ; [ DW_TAG_auto_variable ] [num1] [line 815] +!5 = !{!"0xb\00815\000\00177", !14, !6} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!6 = !{!"0xb\00812\000\00176", !14, !7} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!7 = !{!"0xb\00807\000\00175", !14, !8} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!8 = !{!"0xb\00440\000\0094", !14, !9} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!9 = !{!"0xb\00435\000\0091", !14, !10} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!10 = !{!"0xb\00434\000\0090", !14, !11} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!11 = !{!"0xb\00250\000\0024", !14, !12} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!12 = !{!"0xb\00249\000\0023", !14, !13} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!13 = !{!"0xb\00221\000\0019", !14, !2} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] +!14 = !{!"0x29", !19} ; [ DW_TAG_file_type ] +!15 = !{!"0x1\00\000\00160\008\000\000", null, null, !16, !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char] +!16 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char] +!17 = !{!18} +!18 = !{!"0x21\000\0020"} ; [ DW_TAG_subrange_type ] [0, 19] +!19 = !{!"MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", !"MultiSource/Benchmarks/MiBench/consumer-typeset"} + +!20 = !{!21} +!21 = !{!"0x2e\00AttachGalley\00AttachGalley\00\000\000\001\000\006\00256\001\001", !19, !14, !22, null, i32 (%union.rec**)* @AttachGalley, null, null, null} ; [ DW_TAG_subprogram ] [def] [AttachGalley] +!22 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !23, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!23 = !{null} ; Test DebugValue uses visited by RegisterPressureTracker findUseBetween(). 
; @@ -103,7 +108,7 @@ cond.true: ; preds = %entry unreachable cond.end: ; preds = %entry - call void @llvm.dbg.declare(metadata !{%"class.__gnu_cxx::hash_map"* %X}, metadata !31) + call void @llvm.dbg.declare(metadata %"class.__gnu_cxx::hash_map"* %X, metadata !31, metadata !{!"0x102"}) %_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5 invoke void @_Znwm() to label %exit.i unwind label %lpad2.i.i.i.i @@ -129,9 +134,11 @@ declare void @_Znwm() !llvm.dbg.cu = !{!30} -!30 = metadata !{i32 786449, metadata !34, i32 4, metadata !"clang version 3.3 (trunk 169129) (llvm/trunk 169135)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus] -!31 = metadata !{i32 786688, null, metadata !"X", null, i32 29, metadata !32, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [X] [line 29] -!32 = metadata !{i32 786454, metadata !34, null, metadata !"HM", i32 28, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ] -!33 = metadata !{i32 786473, metadata !34} ; [ DW_TAG_file_type ] -!34 = metadata !{metadata !"SingleSource/Benchmarks/Shootout-C++/hash.cpp", metadata !"SingleSource/Benchmarks/Shootout-C++"} -!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!30 = !{!"0x11\004\00clang version 3.3 (trunk 169129) (llvm/trunk 169135)\001\00\000\00\000", !34, !2, !2, !36, null, null} ; [ DW_TAG_compile_unit ] [SingleSource/Benchmarks/Shootout-C++/hash.cpp] [DW_LANG_C_plus_plus] +!31 = !{!"0x100\00X\0029\000", null, null, !32} ; [ DW_TAG_auto_variable ] [X] [line 29] +!32 = !{!"0x16\00HM\0028\000\000\000\000", !34, null, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ] +!33 = !{!"0x29", !34} ; [ DW_TAG_file_type ] +!34 = !{!"SingleSource/Benchmarks/Shootout-C++/hash.cpp", !"SingleSource/Benchmarks/Shootout-C++"} +!35 = !{i32 1, !"Debug Info Version", i32 2} +!36 = !{!37} +!37 = !{!"0x2e\00main\00main\00\000\000\001\000\006\00256\001\001", !19, !14, !22, null, void ()* @main, null, null, null} ; [ DW_TAG_subprogram ] [def] [main] diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll index 5aec3d92c70f..04b31749ce58 100644 --- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll +++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll @@ -9,7 +9,7 @@ %struct.btCompoundLeafCallback = type { i32, i32 } -declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define void @test() unnamed_addr uwtable ssp align 2 { entry: @@ -20,7 +20,7 @@ if.then: ; preds = %entry unreachable if.end: ; preds = %entry - call void @llvm.dbg.declare(metadata !{%struct.btCompoundLeafCallback* %callback}, metadata !3) + call void @llvm.dbg.declare(metadata %struct.btCompoundLeafCallback* %callback, metadata !3, metadata !{!"0x102"}) %m = getelementptr inbounds %struct.btCompoundLeafCallback* %callback, i64 0, i32 1 store i32 0, i32* undef, align 8 %cmp12447 = icmp sgt i32 undef, 0 @@ -36,11 +36,13 @@ invoke.cont44: ; preds = %if.end !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!8} -!0 = metadata !{i32 786449, metadata !6, i32 4, metadata !"clang version 3.3 (trunk 168984) (llvm/trunk 168983)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] 
[MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus] -!2 = metadata !{null} -!3 = metadata !{i32 786688, null, metadata !"callback", null, i32 214, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [callback] [line 214] -!4 = metadata !{i32 786451, metadata !6, null, metadata !"btCompoundLeafCallback", i32 90, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ] -!5 = metadata !{i32 786473, metadata !6} ; [ DW_TAG_file_type ] -!6 = metadata !{metadata !"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", metadata !"MultiSource/Benchmarks/Bullet"} -!7 = metadata !{i32 0} -!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!0 = !{!"0x11\004\00clang version 3.3 (trunk 168984) (llvm/trunk 168983)\001\00\000\00\000", !6, null, null, !1, null, null} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus] +!1 = !{!2} +!2 = !{!"0x2e\00test\00test\00\000\000\001\000\006\00256\001\001", !6, !5, !7, null, void ()* @test, null, null, null} ; [ DW_TAG_subprogram ] [def] [test] +!3 = !{!"0x100\00callback\00214\000", null, null, !4} ; [ DW_TAG_auto_variable ] [callback] [line 214] +!4 = !{!"0x13\00btCompoundLeafCallback\0090\00512\0064\000\000\000", !6, null, null, null, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ] +!5 = !{!"0x29", !6} ; [ DW_TAG_file_type ] +!6 = !{!"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", !"MultiSource/Benchmarks/Bullet"} +!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !9, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = !{i32 1, !"Debug Info Version", i32 2} +!9 = !{null} diff --git a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll index bbba796eed24..9cd150a2f56d 100644 --- a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll +++ b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll @@ -6,7 +6,7 @@ ; we may reference variables that were not live across basic blocks ; resulting in undefined virtual registers. ; -; In this example, this is illustrated by a the spill/reload of the +; In this example, this is illustrated by a spill/reload of the ; LOADED_PTR_SLOT. ; ; Before this patch, the compiler was accessing two different spill @@ -41,7 +41,7 @@ entry: i1 false, label %label_end ] default: - unreachable + br label %label_end label_true: br label %label_end @@ -80,7 +80,7 @@ entry: i1 false, label %label_end ] default: - unreachable + br label %label_end label_true: br label %label_end @@ -119,7 +119,7 @@ entry: i1 false, label %label_end ] default: - unreachable + br label %label_end label_true: br label %label_end diff --git a/test/CodeGen/X86/2014-08-29-CompactUnwind.ll b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll new file mode 100644 index 000000000000..f65d7c9d2e05 --- /dev/null +++ b/test/CodeGen/X86/2014-08-29-CompactUnwind.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 -filetype=obj -o - | llvm-objdump -d -unwind-info -s - | FileCheck %s +; Regression test for http://llvm.org/bugs/show_bug.cgi?id=20800. 
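+;
+; Editorial note, not part of the original patch: the rejected encoding
+; 0x0309f800 and the expected 0x030df800 appear to differ only in the
+; UNWIND_X86_64_FRAMELESS_STACK_SIZE byte (0x09 vs 0x0d). With the subq
+; starting at offset 0xa, the expected 0x0d is 0xa + 3, which we read as the
+; offset of the subq's 32-bit immediate; the miscounted push size mentioned
+; below shifted this offset and produced the bad encoding.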
+ +; ModuleID = 'asan_report.ii' +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +@.str = private unnamed_addr constant [3 x i8] c"=>\00", align 1 +@.str1 = private unnamed_addr constant [3 x i8] c" \00", align 1 +@.str2 = private unnamed_addr constant [6 x i8] c"%s%p:\00", align 1 + +; CHECK: ___asan_report_error: + +; subq instruction starts at 0x0a, so the second byte of the compact encoding +; (UNWIND_X86_64_FRAMELESS_STACK_SIZE in mach-o/compact_unwind_encoding.h) +; must be 0x0d. +; CHECK: {{a:.*subq.*%rsp}} + +; CHECK: Contents of __compact_unwind section +; CHECK: ___asan_report_error + +; Because of incorrect push instruction size in X86AsmBackend.cpp the stack +; size was also calculated incorrectly. +; CHECK-NOT: {{compact encoding:.*0x0309f800}} +; CHECK: {{compact encoding:.*0x030df800}} + +define void @__asan_report_error() #0 { + %str.i = alloca i64, align 8 + %stack = alloca [256 x i64], align 8 + br label %print_shadow_bytes.exit.i + +print_shadow_bytes.exit.i: ; preds = %print_shadow_bytes.exit.i, %0 + %iv.i = phi i64 [ -5, %0 ], [ %iv.next.i, %print_shadow_bytes.exit.i ] + %reg15 = icmp eq i64 %iv.i, 0 + %.str..str1.i = select i1 %reg15, [3 x i8]* @.str, [3 x i8]* @.str1 + %reg16 = getelementptr inbounds [3 x i8]* %.str..str1.i, i64 0, i64 0 + %reg17 = shl i64 %iv.i, 1 + %reg19 = inttoptr i64 %reg17 to i8* + call void (i64*, i8*, ...)* @append(i64* %str.i, i8* getelementptr inbounds ([6 x i8]* @.str2, i64 0, i64 0), i8* %reg16, i8* %reg19) + %iv.next.i = add nsw i64 %iv.i, 0 + br label %print_shadow_bytes.exit.i +} + +declare void @append(i64*, i8*, ...) + +attributes #0 = { "no-frame-pointer-elim"="false" } diff --git a/test/CodeGen/X86/MachineBranchProb.ll b/test/CodeGen/X86/MachineBranchProb.ll index a8931527ea6d..cf41ef2ea3ad 100644 --- a/test/CodeGen/X86/MachineBranchProb.ll +++ b/test/CodeGen/X86/MachineBranchProb.ll @@ -31,4 +31,4 @@ for.inc20: ; preds = %for.cond2 ret void } -!0 = metadata !{metadata !"branch_weights", i32 112017436, i32 -735157296} +!0 = !{!"branch_weights", i32 112017436, i32 -735157296} diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll index 4ce2fb3dcafb..3a2c58f97e8c 100644 --- a/test/CodeGen/X86/MachineSink-DbgValue.ll +++ b/test/CodeGen/X86/MachineSink-DbgValue.ll @@ -4,10 +4,10 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-apple-macosx10.7.0" define i32 @foo(i32 %i, i32* nocapture %c) nounwind uwtable readonly ssp { - tail call void @llvm.dbg.value(metadata !{i32 %i}, i64 0, metadata !6), !dbg !12 + tail call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata !6, metadata !{!"0x102"}), !dbg !12 %ab = load i32* %c, align 1, !dbg !14 - tail call void @llvm.dbg.value(metadata !{i32* %c}, i64 0, metadata !7), !dbg !13 - tail call void @llvm.dbg.value(metadata !{i32 %ab}, i64 0, metadata !10), !dbg !14 + tail call void @llvm.dbg.value(metadata i32* %c, i64 0, metadata !7, metadata !{!"0x102"}), !dbg !13 + tail call void @llvm.dbg.value(metadata i32 %ab, i64 0, metadata !10, metadata !{!"0x102"}), !dbg !14 %cd = icmp eq i32 %i, 42, !dbg !15 br i1 %cd, label %bb1, label %bb2, !dbg !15 @@ -23,31 +23,31 @@ bb2: ret i32 %.0, !dbg !17 } -declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!22} -!0 = metadata !{i32 786449, metadata !20, 
i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, metadata !18, null, null, null} ; [ DW_TAG_compile_unit ] -!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i32*)* @foo, null, null, metadata !19, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo] -!2 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ] -!3 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!4 = metadata !{metadata !5} -!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!6 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777218, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ] -!7 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 33554434, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ] -!8 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ] -!9 = metadata !{i32 786468, null, metadata !0, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] -!10 = metadata !{i32 786688, metadata !11, metadata !"a", metadata !2, i32 3, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ] -!11 = metadata !{i32 786443, metadata !20, metadata !1, i32 2, i32 25, i32 0} ; [ DW_TAG_lexical_block ] -!12 = metadata !{i32 2, i32 13, metadata !1, null} -!13 = metadata !{i32 2, i32 22, metadata !1, null} -!14 = metadata !{i32 3, i32 14, metadata !11, null} -!15 = metadata !{i32 4, i32 3, metadata !11, null} -!16 = metadata !{i32 5, i32 5, metadata !11, null} -!17 = metadata !{i32 7, i32 1, metadata !11, null} -!18 = metadata !{metadata !1} -!19 = metadata !{metadata !6, metadata !7, metadata !10} -!20 = metadata !{metadata !"a.c", metadata !"/private/tmp"} -!21 = metadata !{i32 0} -!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!0 = !{!"0x11\0012\00Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)\001\00\000\00\001", !20, !21, !21, !18, null, null} ; [ DW_TAG_compile_unit ] +!1 = !{!"0x2e\00foo\00foo\00\002\000\001\000\006\00256\001\000", !20, !2, !3, null, i32 (i32, i32*)* @foo, null, null, !19} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo] +!2 = !{!"0x29", !20} ; [ DW_TAG_file_type ] +!3 = !{!"0x15\00\000\000\000\000\000\000", !20, !2, null, !4, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!4 = !{!5} +!5 = !{!"0x24\00int\000\0032\0032\000\000\005", null, !0} ; [ DW_TAG_base_type ] +!6 = !{!"0x101\00i\0016777218\000", !1, !2, !5} ; [ DW_TAG_arg_variable ] +!7 = !{!"0x101\00c\0033554434\000", !1, !2, !8} ; [ DW_TAG_arg_variable ] +!8 = !{!"0xf\00\000\0064\0064\000\000", null, !0, !9} ; [ DW_TAG_pointer_type ] +!9 = !{!"0x24\00char\000\008\008\000\000\006", null, !0} ; [ DW_TAG_base_type ] +!10 = !{!"0x100\00a\003\000", !11, !2, !9} ; [ DW_TAG_auto_variable ] +!11 = !{!"0xb\002\0025\000", !20, !1} ; [ DW_TAG_lexical_block ] +!12 = !MDLocation(line: 2, column: 13, scope: !1) +!13 = !MDLocation(line: 2, column: 22, scope: !1) +!14 = !MDLocation(line: 3, column: 14, scope: !11) +!15 = 
!MDLocation(line: 4, column: 3, scope: !11) +!16 = !MDLocation(line: 5, column: 5, scope: !11) +!17 = !MDLocation(line: 7, column: 1, scope: !11) +!18 = !{!1} +!19 = !{!6, !7, !10} +!20 = !{!"a.c", !"/private/tmp"} +!21 = !{i32 0} +!22 = !{i32 1, !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll index f6d68520b7b4..dfdaea523fdf 100644 --- a/test/CodeGen/X86/MergeConsecutiveStores.ll +++ b/test/CodeGen/X86/MergeConsecutiveStores.ll @@ -148,12 +148,12 @@ define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) n } -;CHECK-LABEL: merge_loads_i16: -; load: -;CHECK: movw -; store: -;CHECK: movw -;CHECK: ret +; CHECK-LABEL: merge_loads_i16: +; load: +; CHECK: movw +; store: +; CHECK: movw +; CHECK: ret define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp { %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -181,13 +181,13 @@ define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struc ret void } -; The loads and the stores are interleved. Can't merge them. -;CHECK-LABEL: no_merge_loads: -;CHECK: movb -;CHECK: movb -;CHECK: movb -;CHECK: movb -;CHECK: ret +; The loads and the stores are interleaved. Can't merge them. +; CHECK-LABEL: no_merge_loads: +; CHECK: movb +; CHECK: movb +; CHECK: movb +; CHECK: movb +; CHECK: ret define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp { %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -216,12 +216,12 @@ a4: ; preds = %4, %.lr.ph } -;CHECK-LABEL: merge_loads_integer: -; load: -;CHECK: movq -; store: -;CHECK: movq -;CHECK: ret +; CHECK-LABEL: merge_loads_integer: +; load: +; CHECK: movq +; store: +; CHECK: movq +; CHECK: ret define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp { %1 = icmp sgt i32 %count, 0 br i1 %1, label %.lr.ph, label %._crit_edge @@ -250,12 +250,12 @@ define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %s } -;CHECK-LABEL: merge_loads_vector: -; load: -;CHECK: movups -; store: -;CHECK: movups -;CHECK: ret +; CHECK-LABEL: merge_loads_vector: +; load: +; CHECK: movups +; store: +; CHECK: movups +; CHECK: ret define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp { %a1 = icmp sgt i32 %count, 0 br i1 %a1, label %.lr.ph, label %._crit_edge @@ -291,18 +291,18 @@ block4: ; preds = %4, %.lr.ph ret void } -;CHECK-LABEL: merge_loads_no_align: -; load: -;CHECK: movl -;CHECK: movl -;CHECK: movl -;CHECK: movl -; store: -;CHECK: movl -;CHECK: movl -;CHECK: movl -;CHECK: movl -;CHECK: ret +; CHECK-LABEL: merge_loads_no_align: +; load: +; CHECK: movl +; CHECK: movl +; CHECK: movl +; CHECK: movl +; store: +; CHECK: movl +; CHECK: movl +; CHECK: movl +; CHECK: movl +; CHECK: ret define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp { %a1 = icmp sgt i32 %count, 0 br i1 %a1, label %.lr.ph, label %._crit_edge diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll index 51d0d1775c67..498ad7edaa9d 100644 --- a/test/CodeGen/X86/StackColoring-dbg.ll +++ b/test/CodeGen/X86/StackColoring-dbg.ll @@ -5,7 +5,7 @@ target 
datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" -declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone define void @foo() nounwind uwtable ssp { entry: @@ -17,7 +17,7 @@ entry: for.body: call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind call void @llvm.lifetime.start(i64 -1, i8* %x.i) nounwind - call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22) nounwind + call void @llvm.dbg.declare(metadata i8* %x.i, metadata !22, metadata !{!"0x102"}) nounwind br label %for.body } @@ -27,9 +27,9 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!23} -!0 = metadata !{i32 524305, metadata !1, i32 1, metadata !"clang", i1 true, metadata !"", i32 0, metadata !2, metadata !2, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] -!1 = metadata !{metadata !"t.c", metadata !""} -!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} -!2 = metadata !{i32 0} -!22 = metadata !{i32 786688, null, metadata !"x", metadata !2, i32 16, metadata !16, i32 0, i32 0} -!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!0 = !{!"0x11\001\00clang\001\00\000\00\000", !1, !2, !2, null, null, null} ; [ DW_TAG_compile_unit ] +!1 = !{!"t.c", !""} +!16 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] +!2 = !{i32 0} +!22 = !{!"0x100\00x\0016\000", null, !2, !16} ; [ DW_TAG_auto_variable ] +!23 = !{i32 1, !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/SwizzleShuff.ll b/test/CodeGen/X86/SwizzleShuff.ll index 100817a676e8..a435272dca44 100644 --- a/test/CodeGen/X86/SwizzleShuff.ll +++ b/test/CodeGen/X86/SwizzleShuff.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s ; Check that we perform a scalar XOR on i32. 
diff --git a/test/CodeGen/X86/TruncAssertZext.ll b/test/CodeGen/X86/TruncAssertZext.ll new file mode 100644 index 000000000000..8c664127f92a --- /dev/null +++ b/test/CodeGen/X86/TruncAssertZext.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -O2 -march=x86-64 | FileCheck %s +; Checks that a zeroing mov is inserted for the trunc/zext pair even when +; the source of the zext is an AssertSext node +; PR20494 + +define i64 @main(i64 %a) { +; CHECK-LABEL: main +; CHECK: movl %e{{..}}, %eax +; CHECK: ret + %or = or i64 %a, -2 + %trunc = trunc i64 %or to i32 + br label %l +l: + %ext = zext i32 %trunc to i64 + ret i64 %ext +} diff --git a/test/CodeGen/X86/add_shl_constant.ll b/test/CodeGen/X86/add_shl_constant.ll new file mode 100644 index 000000000000..33074e4780e6 --- /dev/null +++ b/test/CodeGen/X86/add_shl_constant.ll @@ -0,0 +1,49 @@ +; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin < %s | FileCheck %s + +; CHECK-LABEL: add_shl_add_constant_1_i32 +; CHECK: leal 984(%rsi,%rdi,8), %eax +; CHECK-NEXT: retq +define i32 @add_shl_add_constant_1_i32(i32 %x, i32 %y) nounwind { + %add.0 = add i32 %x, 123 + %shl = shl i32 %add.0, 3 + %add.1 = add i32 %shl, %y + ret i32 %add.1 +} + +; CHECK-LABEL: add_shl_add_constant_2_i32 +; CHECK: leal 984(%rsi,%rdi,8), %eax +; CHECK-NEXT: retq +define i32 @add_shl_add_constant_2_i32(i32 %x, i32 %y) nounwind { + %add.0 = add i32 %x, 123 + %shl = shl i32 %add.0, 3 + %add.1 = add i32 %y, %shl + ret i32 %add.1 +} + +; CHECK: LCPI2_0: +; CHECK: .long 984 +; CHECK: _add_shl_add_constant_1_v4i32 +; CHECK: pslld $3, %[[REG:xmm[0-9]+]] +; CHECK: paddd %xmm1, %[[REG]] +; CHECK: paddd LCPI2_0(%rip), %[[REG:xmm[0-9]+]] +; CHECK: retq +define <4 x i32> @add_shl_add_constant_1_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { + %add.0 = add <4 x i32> %x, <i32 123, i32 123, i32 123, i32 123> + %shl = shl <4 x i32> %add.0, <i32 3, i32 3, i32 3, i32 3> + %add.1 = add <4 x i32> %shl, %y + ret <4 x i32> %add.1 +} + +; CHECK: LCPI3_0: +; CHECK: .long 984 +; CHECK: _add_shl_add_constant_2_v4i32 +; CHECK: pslld $3, %[[REG:xmm[0-9]+]] +; CHECK: paddd %xmm1, %[[REG]] +; CHECK: paddd LCPI3_0(%rip), %[[REG:xmm[0-9]+]] +; CHECK: retq +define <4 x i32> @add_shl_add_constant_2_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind { + %add.0 = add <4 x i32> %x, <i32 123, i32 123, i32 123, i32 123> + %shl = shl <4 x i32> %add.0, <i32 3, i32 3, i32 3, i32 3> + %add.1 = add <4 x i32> %y, %shl + ret <4 x i32> %add.1 +} diff --git a/test/CodeGen/X86/addr-mode-matcher.ll b/test/CodeGen/X86/addr-mode-matcher.ll new file mode 100644 index 000000000000..d5920910f289 --- /dev/null +++ b/test/CodeGen/X86/addr-mode-matcher.ll @@ -0,0 +1,62 @@ +; RUN: llc < %s | FileCheck %s + +; This testcase used to hit an assert during ISel. For details, see the big +; comment inside the function. + +; CHECK-LABEL: foo: +; The AND should be turned into a subreg access. +; CHECK-NOT: and +; The shift (leal) should be folded into the scale of the address in the load. 
+; CHECK-NOT: leal
+; CHECK: movl {{.*}},4),
+
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.6.0"
+
+define void @foo(i32 %a) {
+bb:
+ br label %bb1692
+
+bb1692:
+ %tmp1694 = phi i32 [ 0, %bb ], [ %tmp1745, %bb1692 ]
+ %xor = xor i32 0, %tmp1694
+
+; %load1 = (load (and (shl %xor, 2), 1020))
+ %tmp1701 = shl i32 %xor, 2
+ %tmp1702 = and i32 %tmp1701, 1020
+ %tmp1703 = getelementptr inbounds [1028 x i8]* null, i32 0, i32 %tmp1702
+ %tmp1704 = bitcast i8* %tmp1703 to i32*
+ %load1 = load i32* %tmp1704, align 4
+
+; %load2 = (load (shl (and %xor, 255), 2))
+ %tmp1698 = and i32 %xor, 255
+ %tmp1706 = shl i32 %tmp1698, 2
+ %tmp1707 = getelementptr inbounds [1028 x i8]* null, i32 0, i32 %tmp1706
+ %tmp1708 = bitcast i8* %tmp1707 to i32*
+ %load2 = load i32* %tmp1708, align 4
+
+ %tmp1710 = or i32 %load2, %a
+
+; While matching xor we address-match %load1. The and-of-shift reassociation
+; in address matching transforms this into a shift-of-and, and the resulting
+; node becomes identical to %load2. CSE replaces %load1, which leaves its
+; references in MatchScope and RecordedNodes stale.
+ %tmp1711 = xor i32 %load1, %tmp1710
+
+ %tmp1744 = getelementptr inbounds [256 x i32]* null, i32 0, i32 %tmp1711
+ store i32 0, i32* %tmp1744, align 4
+ %tmp1745 = add i32 %tmp1694, 1
+ indirectbr i8* undef, [label %bb1756, label %bb1692]
+
+bb1756:
+ br label %bb2705
+
+bb2705:
+ indirectbr i8* undef, [label %bb5721, label %bb5736]
+
+bb5721:
+ br label %bb2705
+
+bb5736:
+ ret void
+}
diff --git a/test/CodeGen/X86/adx-intrinsics.ll b/test/CodeGen/X86/adx-intrinsics.ll
new file mode 100644
index 000000000000..0498177a9c12
--- /dev/null
+++ b/test/CodeGen/X86/adx-intrinsics.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 --show-mc-encoding| FileCheck %s --check-prefix=NOADX --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=broadwell --show-mc-encoding| FileCheck %s --check-prefix=ADX --check-prefix=CHECK
+
+declare i8 @llvm.x86.addcarryx.u32(i8, i32, i32, i8*)
+
+define i8 @test_addcarryx_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarryx_u32
+; CHECK: addb
+; ADX: adcxl
+; CHECK: setb
+; CHECK: retq
+ %ret = tail call i8 @llvm.x86.addcarryx.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
+ ret i8 %ret;
+}
+
+declare i8 @llvm.x86.addcarryx.u64(i8, i64, i64, i8*)
+
+define i8 @test_addcarryx_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarryx_u64
+; CHECK: addb
+; ADX: adcxq
+; CHECK: setb
+; CHECK: retq
+ %ret = tail call i8 @llvm.x86.addcarryx.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
+ ret i8 %ret;
+}
+
+declare i8 @llvm.x86.addcarry.u32(i8, i32, i32, i8*)
+
+define i8 @test_addcarry_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarry_u32
+; CHECK: addb
+; ADX: adcxl
+; NOADX: adcl
+; CHECK: setb
+; CHECK: retq
+ %ret = tail call i8 @llvm.x86.addcarry.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
+ ret i8 %ret;
+}
+
+declare i8 @llvm.x86.addcarry.u64(i8, i64, i64, i8*)
+
+define i8 @test_addcarry_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
+; CHECK-LABEL: test_addcarry_u64
+; CHECK: addb
+; ADX: adcxq
+; NOADX: adcq
+; CHECK: setb
+; CHECK: retq
+ %ret = tail call i8 @llvm.x86.addcarry.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
+ ret i8 %ret;
+}
+
+declare i8 @llvm.x86.subborrow.u32(i8, i32, i32, i8*)
+
+define i8 @test_subborrow_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
+; CHECK-LABEL: test_subborrow_u32
+; CHECK: addb
+; CHECK: sbbl
+; CHECK: setb
+; CHECK: retq
+ %ret = tail
call i8 @llvm.x86.subborrow.u32(i8 %c, i32 %a, i32 %b, i8* %ptr) + ret i8 %ret; +} + +declare i8 @llvm.x86.subborrow.u64(i8, i64, i64, i8*) + +define i8 @test_subborrow_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) { +; CHECK-LABEL: test_subborrow_u64 +; CHECK: addb +; CHECK: sbbq +; CHECK: setb +; CHECK: retq + %ret = tail call i8 @llvm.x86.subborrow.u64(i8 %c, i64 %a, i64 %b, i8* %ptr) + ret i8 %ret; +} + diff --git a/test/CodeGen/X86/aliases.ll b/test/CodeGen/X86/aliases.ll index bf55644de41e..82a8e482b7fa 100644 --- a/test/CodeGen/X86/aliases.ll +++ b/test/CodeGen/X86/aliases.ll @@ -30,12 +30,12 @@ define i32 @foo_f() { ret i32 0 } ; CHECK-DAG: .weak bar_f -@bar_f = alias weak %FunTy* @foo_f +@bar_f = weak alias %FunTy* @foo_f -@bar_l = alias linkonce_odr i32* @bar +@bar_l = linkonce_odr alias i32* @bar ; CHECK-DAG: .weak bar_l -@bar_i = alias internal i32* @bar +@bar_i = internal alias i32* @bar ; CHECK-DAG: .globl A @A = alias bitcast (i32* @bar to i64*) diff --git a/test/CodeGen/X86/aligned-variadic.ll b/test/CodeGen/X86/aligned-variadic.ll new file mode 100644 index 000000000000..e2155fe4b373 --- /dev/null +++ b/test/CodeGen/X86/aligned-variadic.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=X32 + +%struct.Baz = type { [17 x i8] } +%struct.__va_list_tag = type { i32, i32, i8*, i8* } + +; Function Attrs: nounwind uwtable +define void @bar(%struct.Baz* byval nocapture readnone align 8 %x, ...) { +entry: + %va = alloca [1 x %struct.__va_list_tag], align 16 + %arraydecay = getelementptr inbounds [1 x %struct.__va_list_tag]* %va, i64 0, i64 0 + %arraydecay1 = bitcast [1 x %struct.__va_list_tag]* %va to i8* + call void @llvm.va_start(i8* %arraydecay1) + %overflow_arg_area_p = getelementptr inbounds [1 x %struct.__va_list_tag]* %va, i64 0, i64 0, i32 2 + %overflow_arg_area = load i8** %overflow_arg_area_p, align 8 + %overflow_arg_area.next = getelementptr i8* %overflow_arg_area, i64 24 + store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8 +; X32: leal 68(%esp), [[REG:%.*]] +; X32: movl [[REG]], 16(%esp) +; X64: leaq 232(%rsp), [[REG:%.*]] +; X64: movq [[REG]], 184(%rsp) +; X64: leaq 176(%rsp), %rdi + call void @qux(%struct.__va_list_tag* %arraydecay) + ret void +} + +; Function Attrs: nounwind +declare void @llvm.va_start(i8*) + +declare void @qux(%struct.__va_list_tag*) diff --git a/test/CodeGen/X86/alloca-align-rounding.ll b/test/CodeGen/X86/alloca-align-rounding.ll index 74b9470db752..9d8b6cfa6730 100644 --- a/test/CodeGen/X86/alloca-align-rounding.ll +++ b/test/CodeGen/X86/alloca-align-rounding.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux -enable-misched=false | FileCheck %s +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnux32 -enable-misched=false | FileCheck %s -check-prefix=X32ABI declare void @bar(<2 x i64>* %n) @@ -6,15 +7,29 @@ define void @foo(i64 %h) { %p = alloca <2 x i64>, i64 %h call void @bar(<2 x i64>* %p) ret void -; CHECK: foo +; CHECK-LABEL: foo ; CHECK-NOT: andq $-32, %rax +; X32ABI-LABEL: foo +; X32ABI-NOT: andl $-32, %eax } define void @foo2(i64 %h) { %p = alloca <2 x i64>, i64 %h, align 32 call void @bar(<2 x i64>* %p) ret void -; CHECK: foo2 +; CHECK-LABEL: foo2 ; CHECK: andq $-32, %rsp ; CHECK: andq $-32, %rax +; X32ABI-LABEL: foo2 +; X32ABI: andl $-32, %esp +; X32ABI: andl $-32, %eax +} + +define void @foo3(i64 %h) { + %p = alloca <2 x i64>, i64 %h + ret void +; CHECK-LABEL: foo3 +; CHECK: movq 
%rbp, %rsp +; X32ABI-LABEL: foo3 +; X32ABI: movl %ebp, %esp } diff --git a/test/CodeGen/X86/asm-block-labels.ll b/test/CodeGen/X86/asm-block-labels.ll index 6dbfb16a6d50..93524386c6ba 100644 --- a/test/CodeGen/X86/asm-block-labels.ll +++ b/test/CodeGen/X86/asm-block-labels.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -std-compile-opts | llc -no-integrated-as +; RUN: opt < %s -O3 | llc -no-integrated-as ; ModuleID = 'block12.c' target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i686-apple-darwin8" diff --git a/test/CodeGen/X86/asm-label.ll b/test/CodeGen/X86/asm-label.ll index 1fc6e2eaf2b7..1da66e74d34f 100644 --- a/test/CodeGen/X86/asm-label.ll +++ b/test/CodeGen/X86/asm-label.ll @@ -24,7 +24,7 @@ if.end: ; preds = %if.then br label %cleanup cleanup: ; preds = %if.end, %if.then9 - switch i32 undef, label %unreachable [ + switch i32 undef, label %default [ i32 0, label %cleanup.cont i32 1, label %if.end11 ] @@ -35,6 +35,6 @@ cleanup.cont: ; preds = %cleanup if.end11: ; preds = %cleanup.cont, %cleanup, %land.lhs.true, %entry ret void -unreachable: ; preds = %cleanup - unreachable +default: ; preds = %cleanup + br label %if.end11 } diff --git a/test/CodeGen/X86/atomic-load-store-wide.ll b/test/CodeGen/X86/atomic-load-store-wide.ll index 7352d5a58006..ad1a5c6d0267 100644 --- a/test/CodeGen/X86/atomic-load-store-wide.ll +++ b/test/CodeGen/X86/atomic-load-store-wide.ll @@ -4,16 +4,18 @@ ; FIXME: The generated code can be substantially improved. define void @test1(i64* %ptr, i64 %val1) { -; CHECK: test1 -; CHECK: cmpxchg8b +; CHECK-LABEL: test1 +; CHECK: lock +; CHECK-NEXT: cmpxchg8b ; CHECK-NEXT: jne store atomic i64 %val1, i64* %ptr seq_cst, align 8 ret void } define i64 @test2(i64* %ptr) { -; CHECK: test2 -; CHECK: cmpxchg8b +; CHECK-LABEL: test2 +; CHECK: lock +; CHECK-NEXT: cmpxchg8b %val = load atomic i64* %ptr seq_cst, align 8 ret i64 %val } diff --git a/test/CodeGen/X86/atomic16.ll b/test/CodeGen/X86/atomic16.ll index faaa4c49d39b..f6892de43d89 100644 --- a/test/CodeGen/X86/atomic16.ll +++ b/test/CodeGen/X86/atomic16.ll @@ -15,17 +15,17 @@ entry: ; X32: incw %t2 = atomicrmw add i16* @sc16, i16 3 acquire ; X64: lock -; X64: addw $3, {{.*}} # encoding: [0xf0,0x66 +; X64: addw $3, {{.*}} # encoding: [0x66,0xf0 ; X32: lock ; X32: addw $3 %t3 = atomicrmw add i16* @sc16, i16 5 acquire ; X64: lock -; X64: xaddw {{.*}} # encoding: [0xf0,0x66 +; X64: xaddw {{.*}} # encoding: [0x66,0xf0 ; X32: lock ; X32: xaddw %t4 = atomicrmw add i16* @sc16, i16 %t3 acquire ; X64: lock -; X64: addw {{.*}} # encoding: [0xf0,0x66 +; X64: addw {{.*}} # encoding: [0x66,0xf0 ; X32: lock ; X32: addw ret void @@ -43,17 +43,17 @@ define void @atomic_fetch_sub16() nounwind { ; X32: decw %t2 = atomicrmw sub i16* @sc16, i16 3 acquire ; X64: lock -; X64: subw $3, {{.*}} # encoding: [0xf0,0x66 +; X64: subw $3, {{.*}} # encoding: [0x66,0xf0 ; X32: lock ; X32: subw $3 %t3 = atomicrmw sub i16* @sc16, i16 5 acquire ; X64: lock -; X64: xaddw {{.*}} # encoding: [0xf0,0x66 +; X64: xaddw {{.*}} # encoding: [0x66,0xf0 ; X32: lock ; X32: xaddw %t4 = atomicrmw sub i16* @sc16, i16 %t3 acquire ; X64: lock -; X64: subw {{.*}} # encoding: [0xf0,0x66 +; X64: subw {{.*}} # encoding: [0x66,0xf0 ; X32: lock ; X32: subw ret void @@ -66,7 +66,7 @@ define void @atomic_fetch_and16() nounwind { ; X32-LABEL: atomic_fetch_and16 %t1 = atomicrmw and i16* @sc16, i16 3 acquire ; X64: lock -; X64: andw $3, {{.*}} # encoding: [0xf0,0x66 +; X64: andw $3, {{.*}} 
# encoding: [0x66,0xf0 ; X32: lock ; X32: andw $3 %t2 = atomicrmw and i16* @sc16, i16 5 acquire @@ -78,7 +78,7 @@ define void @atomic_fetch_and16() nounwind { ; X32: cmpxchgw %t3 = atomicrmw and i16* @sc16, i16 %t2 acquire ; X64: lock -; X64: andw {{.*}} # encoding: [0xf0,0x66 +; X64: andw {{.*}} # encoding: [0x66,0xf0 ; X32: lock ; X32: andw ret void @@ -91,7 +91,7 @@ define void @atomic_fetch_or16() nounwind { ; X32-LABEL: atomic_fetch_or16 %t1 = atomicrmw or i16* @sc16, i16 3 acquire ; X64: lock -; X64: orw $3, {{.*}} # encoding: [0xf0,0x66 +; X64: orw $3, {{.*}} # encoding: [0x66,0xf0 ; X32: lock ; X32: orw $3 %t2 = atomicrmw or i16* @sc16, i16 5 acquire @@ -103,7 +103,7 @@ define void @atomic_fetch_or16() nounwind { ; X32: cmpxchgw %t3 = atomicrmw or i16* @sc16, i16 %t2 acquire ; X64: lock -; X64: orw {{.*}} # encoding: [0xf0,0x66 +; X64: orw {{.*}} # encoding: [0x66,0xf0 ; X32: lock ; X32: orw ret void @@ -116,7 +116,7 @@ define void @atomic_fetch_xor16() nounwind { ; X32-LABEL: atomic_fetch_xor16 %t1 = atomicrmw xor i16* @sc16, i16 3 acquire ; X64: lock -; X64: xorw $3, {{.*}} # encoding: [0xf0,0x66 +; X64: xorw $3, {{.*}} # encoding: [0x66,0xf0 ; X32: lock ; X32: xorw $3 %t2 = atomicrmw xor i16* @sc16, i16 5 acquire @@ -128,7 +128,7 @@ define void @atomic_fetch_xor16() nounwind { ; X32: cmpxchgw %t3 = atomicrmw xor i16* @sc16, i16 %t2 acquire ; X64: lock -; X64: xorw {{.*}} # encoding: [0xf0,0x66 +; X64: xorw {{.*}} # encoding: [0x66,0xf0 ; X32: lock ; X32: xorw ret void diff --git a/test/CodeGen/X86/atomic_add.ll b/test/CodeGen/X86/atomic_add.ll index bdd25e6a2a56..f60212de5339 100644 --- a/test/CodeGen/X86/atomic_add.ll +++ b/test/CodeGen/X86/atomic_add.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=x86-64 -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC ; rdar://7103704 @@ -14,6 +15,8 @@ define void @inc4(i64* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: inc4: ; CHECK: incq +; SLOW_INC-LABEL: inc4: +; SLOW_INC-NOT: incq %0 = atomicrmw add i64* %p, i64 1 monotonic ret void } @@ -39,6 +42,8 @@ define void @inc3(i8* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: inc3: ; CHECK: incb +; SLOW_INC-LABEL: inc3: +; SLOW_INC-NOT: incb %0 = atomicrmw add i8* %p, i8 1 monotonic ret void } @@ -64,6 +69,8 @@ define void @inc2(i16* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: inc2: ; CHECK: incw +; SLOW_INC-LABEL: inc2: +; SLOW_INC-NOT: incw %0 = atomicrmw add i16* %p, i16 1 monotonic ret void } @@ -89,6 +96,8 @@ define void @inc1(i32* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: inc1: ; CHECK: incl +; SLOW_INC-LABEL: inc1: +; SLOW_INC-NOT: incl %0 = atomicrmw add i32* %p, i32 1 monotonic ret void } @@ -113,6 +122,8 @@ define void @dec4(i64* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: dec4: ; CHECK: decq +; SLOW_INC-LABEL: dec4: +; SLOW_INC-NOT: decq %0 = atomicrmw sub i64* %p, i64 1 monotonic ret void } @@ -138,6 +149,8 @@ define void @dec3(i8* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: dec3: ; CHECK: decb +; SLOW_INC-LABEL: dec3: +; SLOW_INC-NOT: decb %0 = atomicrmw sub i8* %p, i8 1 monotonic ret void } @@ -163,6 +176,8 @@ define void @dec2(i16* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: dec2: ; CHECK: decw +; SLOW_INC-LABEL: dec2: +; SLOW_INC-NOT: decw %0 = atomicrmw sub i16* %p, i16 1 monotonic ret void } @@ -189,6 +204,8 @@ define void @dec1(i32* nocapture %p) nounwind ssp { entry: ; CHECK-LABEL: dec1: ; CHECK: decl +; SLOW_INC-LABEL: dec1: +; 
SLOW_INC-NOT: decl
 %0 = atomicrmw sub i32* %p, i32 1 monotonic
 ret void
}
diff --git a/test/CodeGen/X86/atomic_idempotent.ll b/test/CodeGen/X86/atomic_idempotent.ll
new file mode 100644
index 000000000000..1afc535133d6
--- /dev/null
+++ b/test/CodeGen/X86/atomic_idempotent.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -march=x86 -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+
+; On x86, an atomic rmw operation that does not modify the value in memory
+; (such as atomic add 0) can be replaced by an mfence followed by a mov.
+; This is explained (with the motivation for such an optimization) in
+; http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
+
+define i8 @add8(i8* %p) {
+; CHECK-LABEL: add8
+; CHECK: mfence
+; CHECK: movb
+ %1 = atomicrmw add i8* %p, i8 0 monotonic
+ ret i8 %1
+}
+
+define i16 @or16(i16* %p) {
+; CHECK-LABEL: or16
+; CHECK: mfence
+; CHECK: movw
+ %1 = atomicrmw or i16* %p, i16 0 acquire
+ ret i16 %1
+}
+
+define i32 @xor32(i32* %p) {
+; CHECK-LABEL: xor32
+; CHECK: mfence
+; CHECK: movl
+ %1 = atomicrmw xor i32* %p, i32 0 release
+ ret i32 %1
+}
+
+define i64 @sub64(i64* %p) {
+; CHECK-LABEL: sub64
+; X64: mfence
+; X64: movq
+; X32-NOT: mfence
+ %1 = atomicrmw sub i64* %p, i64 0 seq_cst
+ ret i64 %1
+}
+
+define i128 @or128(i128* %p) {
+; CHECK-LABEL: or128
+; CHECK-NOT: mfence
+ %1 = atomicrmw or i128* %p, i128 0 monotonic
+ ret i128 %1
+}
+
+; For 'and', the idempotent value is (-1)
+define i32 @and32 (i32* %p) {
+; CHECK-LABEL: and32
+; CHECK: mfence
+; CHECK: movl
+ %1 = atomicrmw and i32* %p, i32 -1 acq_rel
+ ret i32 %1
+}
diff --git a/test/CodeGen/X86/atomic_mi.ll b/test/CodeGen/X86/atomic_mi.ll
new file mode 100644
index 000000000000..19e019eaddcd
--- /dev/null
+++ b/test/CodeGen/X86/atomic_mi.ll
@@ -0,0 +1,525 @@
+; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix X64
+; RUN: llc < %s -march=x86 -verify-machineinstrs | FileCheck %s --check-prefix X32
+; RUN: llc < %s -march=x86-64 -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC
+
+; This file checks that atomic (non-seq_cst) stores of immediate values are
+; done in one mov instruction and not two. More precisely, it makes sure that
+; the immediate is not first copied uselessly into a register.
+
+; Similarly, it checks that a binary operation of an immediate with an atomic
+; variable that is stored back in that variable is done as a single instruction.
+; For example: x.store(42 + x.load(memory_order_acquire), memory_order_release)
+; should be just an add instruction, instead of loading x into a register, doing
+; an add and storing the result back.
+; The binary operations supported are currently add, and, or, xor.
+; sub is not supported because it is translated into an addition of the
+; negated immediate.
+; Finally, we also check the same kind of pattern for inc/dec.
+
+; seq_cst stores are left as (lock) xchgl, but we try to check every other
+; attribute at least once.
+
+; Please note that these operations do not require the lock prefix: only
+; sequentially consistent stores require this kind of protection on X86.
+; And even for seq_cst operations, llvm uses the xchg instruction, which has
+; an implicit lock prefix, so making it explicit is not required.
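+;
+; A rough source-level sketch of the patterns exercised below (an editorial
+; illustration, not part of the original test; assumes C++11 std::atomic<int> x):
+;   x.store(42, std::memory_order_release);    // wants a single: movl $42, (mem)
+;   x.store(x.load(std::memory_order_acquire) + 2,
+;           std::memory_order_release);        // wants a single: addl $2, (mem)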
+
+define void @store_atomic_imm_8(i8* %p) {
+; X64-LABEL: store_atomic_imm_8
+; X64: movb
+; X64-NOT: movb
+; X32-LABEL: store_atomic_imm_8
+; X32: movb
+; X32-NOT: movb
+ store atomic i8 42, i8* %p release, align 1
+ ret void
+}
+
+define void @store_atomic_imm_16(i16* %p) {
+; X64-LABEL: store_atomic_imm_16
+; X64: movw
+; X64-NOT: movw
+; X32-LABEL: store_atomic_imm_16
+; X32: movw
+; X32-NOT: movw
+ store atomic i16 42, i16* %p monotonic, align 2
+ ret void
+}
+
+define void @store_atomic_imm_32(i32* %p) {
+; X64-LABEL: store_atomic_imm_32
+; X64: movl
+; X64-NOT: movl
+; On 32 bits, there is an extra movl for each of those functions
+; (probably for alignment reasons).
+; X32-LABEL: store_atomic_imm_32
+; X32: movl 4(%esp), %eax
+; X32: movl
+; X32-NOT: movl
+ store atomic i32 42, i32* %p release, align 4
+ ret void
+}
+
+define void @store_atomic_imm_64(i64* %p) {
+; X64-LABEL: store_atomic_imm_64
+; X64: movq
+; X64-NOT: movq
+; These are implemented with a CAS loop on 32 bit architectures, and thus
+; cannot be optimized in the same way as the others.
+; X32-LABEL: store_atomic_imm_64
+; X32: cmpxchg8b
+ store atomic i64 42, i64* %p release, align 8
+ ret void
+}
+
+; If an immediate is too big to fit in 32 bits, it cannot be stored in one
+; mov; even on X64, one must use movabsq, which can only target a register.
+define void @store_atomic_imm_64_big(i64* %p) {
+; X64-LABEL: store_atomic_imm_64_big
+; X64: movabsq
+; X64: movq
+ store atomic i64 100000000000, i64* %p monotonic, align 8
+ ret void
+}
+
+; It would be incorrect to replace a lock xchgl by a movl
+define void @store_atomic_imm_32_seq_cst(i32* %p) {
+; X64-LABEL: store_atomic_imm_32_seq_cst
+; X64: xchgl
+; X32-LABEL: store_atomic_imm_32_seq_cst
+; X32: xchgl
+ store atomic i32 42, i32* %p seq_cst, align 4
+ ret void
+}
+
+; ----- ADD -----
+
+define void @add_8(i8* %p) {
+; X64-LABEL: add_8
+; X64-NOT: lock
+; X64: addb
+; X64-NOT: movb
+; X32-LABEL: add_8
+; X32-NOT: lock
+; X32: addb
+; X32-NOT: movb
+ %1 = load atomic i8* %p seq_cst, align 1
+ %2 = add i8 %1, 2
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+}
+
+define void @add_16(i16* %p) {
+; Currently the transformation is not done on 16 bit accesses, as the backend
+; treats 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: add_16
+; X64-NOT: addw
+; X32-LABEL: add_16
+; X32-NOT: addw
+ %1 = load atomic i16* %p acquire, align 2
+ %2 = add i16 %1, 2
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+}
+
+define void @add_32(i32* %p) {
+; X64-LABEL: add_32
+; X64-NOT: lock
+; X64: addl
+; X64-NOT: movl
+; X32-LABEL: add_32
+; X32-NOT: lock
+; X32: addl
+; X32-NOT: movl
+ %1 = load atomic i32* %p acquire, align 4
+ %2 = add i32 %1, 2
+ store atomic i32 %2, i32* %p monotonic, align 4
+ ret void
+}
+
+define void @add_64(i64* %p) {
+; X64-LABEL: add_64
+; X64-NOT: lock
+; X64: addq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'addq'.
+; X32-LABEL: add_64
+ %1 = load atomic i64* %p acquire, align 8
+ %2 = add i64 %1, 2
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+}
+
+define void @add_32_seq_cst(i32* %p) {
+; X64-LABEL: add_32_seq_cst
+; X64: xchgl
+; X32-LABEL: add_32_seq_cst
+; X32: xchgl
+ %1 = load atomic i32* %p monotonic, align 4
+ %2 = add i32 %1, 2
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+}
+
+; ----- AND -----
+
+define void @and_8(i8* %p) {
+; X64-LABEL: and_8
+; X64-NOT: lock
+; X64: andb
+; X64-NOT: movb
+; X32-LABEL: and_8
+; X32-NOT: lock
+; X32: andb
+; X32-NOT: movb
+ %1 = load atomic i8* %p monotonic, align 1
+ %2 = and i8 %1, 2
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+}
+
+define void @and_16(i16* %p) {
+; Currently the transformation is not done on 16 bit accesses, as the backend
+; treats 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: and_16
+; X64-NOT: andw
+; X32-LABEL: and_16
+; X32-NOT: andw
+ %1 = load atomic i16* %p acquire, align 2
+ %2 = and i16 %1, 2
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+}
+
+define void @and_32(i32* %p) {
+; X64-LABEL: and_32
+; X64-NOT: lock
+; X64: andl
+; X64-NOT: movl
+; X32-LABEL: and_32
+; X32-NOT: lock
+; X32: andl
+; X32-NOT: movl
+ %1 = load atomic i32* %p acquire, align 4
+ %2 = and i32 %1, 2
+ store atomic i32 %2, i32* %p release, align 4
+ ret void
+}
+
+define void @and_64(i64* %p) {
+; X64-LABEL: and_64
+; X64-NOT: lock
+; X64: andq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'andq'.
+; X32-LABEL: and_64
+ %1 = load atomic i64* %p acquire, align 8
+ %2 = and i64 %1, 2
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+}
+
+define void @and_32_seq_cst(i32* %p) {
+; X64-LABEL: and_32_seq_cst
+; X64: xchgl
+; X32-LABEL: and_32_seq_cst
+; X32: xchgl
+ %1 = load atomic i32* %p monotonic, align 4
+ %2 = and i32 %1, 2
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+}
+
+; ----- OR -----
+
+define void @or_8(i8* %p) {
+; X64-LABEL: or_8
+; X64-NOT: lock
+; X64: orb
+; X64-NOT: movb
+; X32-LABEL: or_8
+; X32-NOT: lock
+; X32: orb
+; X32-NOT: movb
+ %1 = load atomic i8* %p acquire, align 1
+ %2 = or i8 %1, 2
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+}
+
+define void @or_16(i16* %p) {
+; X64-LABEL: or_16
+; X64-NOT: orw
+; X32-LABEL: or_16
+; X32-NOT: orw
+ %1 = load atomic i16* %p acquire, align 2
+ %2 = or i16 %1, 2
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+}
+
+define void @or_32(i32* %p) {
+; X64-LABEL: or_32
+; X64-NOT: lock
+; X64: orl
+; X64-NOT: movl
+; X32-LABEL: or_32
+; X32-NOT: lock
+; X32: orl
+; X32-NOT: movl
+ %1 = load atomic i32* %p acquire, align 4
+ %2 = or i32 %1, 2
+ store atomic i32 %2, i32* %p release, align 4
+ ret void
+}
+
+define void @or_64(i64* %p) {
+; X64-LABEL: or_64
+; X64-NOT: lock
+; X64: orq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'orq'.
+; X32-LABEL: or_64
+ %1 = load atomic i64* %p acquire, align 8
+ %2 = or i64 %1, 2
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+}
+
+define void @or_32_seq_cst(i32* %p) {
+; X64-LABEL: or_32_seq_cst
+; X64: xchgl
+; X32-LABEL: or_32_seq_cst
+; X32: xchgl
+ %1 = load atomic i32* %p monotonic, align 4
+ %2 = or i32 %1, 2
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+}
+
+; ----- XOR -----
+
+define void @xor_8(i8* %p) {
+; X64-LABEL: xor_8
+; X64-NOT: lock
+; X64: xorb
+; X64-NOT: movb
+; X32-LABEL: xor_8
+; X32-NOT: lock
+; X32: xorb
+; X32-NOT: movb
+ %1 = load atomic i8* %p acquire, align 1
+ %2 = xor i8 %1, 2
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+}
+
+define void @xor_16(i16* %p) {
+; X64-LABEL: xor_16
+; X64-NOT: xorw
+; X32-LABEL: xor_16
+; X32-NOT: xorw
+ %1 = load atomic i16* %p acquire, align 2
+ %2 = xor i16 %1, 2
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+}
+
+define void @xor_32(i32* %p) {
+; X64-LABEL: xor_32
+; X64-NOT: lock
+; X64: xorl
+; X64-NOT: movl
+; X32-LABEL: xor_32
+; X32-NOT: lock
+; X32: xorl
+; X32-NOT: movl
+ %1 = load atomic i32* %p acquire, align 4
+ %2 = xor i32 %1, 2
+ store atomic i32 %2, i32* %p release, align 4
+ ret void
+}
+
+define void @xor_64(i64* %p) {
+; X64-LABEL: xor_64
+; X64-NOT: lock
+; X64: xorq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'xorq'.
+; X32-LABEL: xor_64
+ %1 = load atomic i64* %p acquire, align 8
+ %2 = xor i64 %1, 2
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+}
+
+define void @xor_32_seq_cst(i32* %p) {
+; X64-LABEL: xor_32_seq_cst
+; X64: xchgl
+; X32-LABEL: xor_32_seq_cst
+; X32: xchgl
+ %1 = load atomic i32* %p monotonic, align 4
+ %2 = xor i32 %1, 2
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+}
+
+; ----- INC -----
+
+define void @inc_8(i8* %p) {
+; X64-LABEL: inc_8
+; X64-NOT: lock
+; X64: incb
+; X64-NOT: movb
+; X32-LABEL: inc_8
+; X32-NOT: lock
+; X32: incb
+; X32-NOT: movb
+; SLOW_INC-LABEL: inc_8
+; SLOW_INC-NOT: incb
+; SLOW_INC-NOT: movb
+ %1 = load atomic i8* %p seq_cst, align 1
+ %2 = add i8 %1, 1
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+}
+
+define void @inc_16(i16* %p) {
+; Currently the transformation is not done on 16 bit accesses, as the backend
+; treats 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: inc_16
+; X64-NOT: incw
+; X32-LABEL: inc_16
+; X32-NOT: incw
+; SLOW_INC-LABEL: inc_16
+; SLOW_INC-NOT: incw
+ %1 = load atomic i16* %p acquire, align 2
+ %2 = add i16 %1, 1
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+}
+
+define void @inc_32(i32* %p) {
+; X64-LABEL: inc_32
+; X64-NOT: lock
+; X64: incl
+; X64-NOT: movl
+; X32-LABEL: inc_32
+; X32-NOT: lock
+; X32: incl
+; X32-NOT: movl
+; SLOW_INC-LABEL: inc_32
+; SLOW_INC-NOT: incl
+; SLOW_INC-NOT: movl
+ %1 = load atomic i32* %p acquire, align 4
+ %2 = add i32 %1, 1
+ store atomic i32 %2, i32* %p monotonic, align 4
+ ret void
+}
+
+define void @inc_64(i64* %p) {
+; X64-LABEL: inc_64
+; X64-NOT: lock
+; X64: incq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'incq'.
+; X32-LABEL: inc_64
+; SLOW_INC-LABEL: inc_64
+; SLOW_INC-NOT: incq
+; SLOW_INC-NOT: movq
+ %1 = load atomic i64* %p acquire, align 8
+ %2 = add i64 %1, 1
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+}
+
+define void @inc_32_seq_cst(i32* %p) {
+; X64-LABEL: inc_32_seq_cst
+; X64: xchgl
+; X32-LABEL: inc_32_seq_cst
+; X32: xchgl
+ %1 = load atomic i32* %p monotonic, align 4
+ %2 = add i32 %1, 1
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+}
+
+; ----- DEC -----
+
+define void @dec_8(i8* %p) {
+; X64-LABEL: dec_8
+; X64-NOT: lock
+; X64: decb
+; X64-NOT: movb
+; X32-LABEL: dec_8
+; X32-NOT: lock
+; X32: decb
+; X32-NOT: movb
+; SLOW_INC-LABEL: dec_8
+; SLOW_INC-NOT: decb
+; SLOW_INC-NOT: movb
+ %1 = load atomic i8* %p seq_cst, align 1
+ %2 = sub i8 %1, 1
+ store atomic i8 %2, i8* %p release, align 1
+ ret void
+}
+
+define void @dec_16(i16* %p) {
+; Currently the transformation is not done on 16 bit accesses, as the backend
+; treats 16 bit arithmetic as expensive on X86/X86_64.
+; X64-LABEL: dec_16
+; X64-NOT: decw
+; X32-LABEL: dec_16
+; X32-NOT: decw
+; SLOW_INC-LABEL: dec_16
+; SLOW_INC-NOT: decw
+ %1 = load atomic i16* %p acquire, align 2
+ %2 = sub i16 %1, 1
+ store atomic i16 %2, i16* %p release, align 2
+ ret void
+}
+
+define void @dec_32(i32* %p) {
+; X64-LABEL: dec_32
+; X64-NOT: lock
+; X64: decl
+; X64-NOT: movl
+; X32-LABEL: dec_32
+; X32-NOT: lock
+; X32: decl
+; X32-NOT: movl
+; SLOW_INC-LABEL: dec_32
+; SLOW_INC-NOT: decl
+; SLOW_INC-NOT: movl
+ %1 = load atomic i32* %p acquire, align 4
+ %2 = sub i32 %1, 1
+ store atomic i32 %2, i32* %p monotonic, align 4
+ ret void
+}
+
+define void @dec_64(i64* %p) {
+; X64-LABEL: dec_64
+; X64-NOT: lock
+; X64: decq
+; X64-NOT: movq
+; We do not check X86-32 as it cannot do 'decq'.
+; X32-LABEL: dec_64
+; SLOW_INC-LABEL: dec_64
+; SLOW_INC-NOT: decq
+; SLOW_INC-NOT: movq
+ %1 = load atomic i64* %p acquire, align 8
+ %2 = sub i64 %1, 1
+ store atomic i64 %2, i64* %p release, align 8
+ ret void
+}
+
+define void @dec_32_seq_cst(i32* %p) {
+; X64-LABEL: dec_32_seq_cst
+; X64: xchgl
+; X32-LABEL: dec_32_seq_cst
+; X32: xchgl
+ %1 = load atomic i32* %p monotonic, align 4
+ %2 = sub i32 %1, 1
+ store atomic i32 %2, i32* %p seq_cst, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/avoid_complex_am.ll b/test/CodeGen/X86/avoid_complex_am.ll
index 7f095190ab8f..e5e7bd23a641
--- a/test/CodeGen/X86/avoid_complex_am.ll
+++ b/test/CodeGen/X86/avoid_complex_am.ll
@@ -22,7 +22,7 @@ for.body: ; preds = %for.body, %entry
 %arrayidx = getelementptr inbounds double* %b, i64 %tmp
 %tmp1 = load double* %arrayidx, align 8
 ; The induction variable should carry the scaling factor: 1.
-; CHECK: [[IVNEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK: [[IVNEXT]] = add nuw i64 [[IV]], 1 %indvars.iv.next = add i64 %indvars.iv, 1 %arrayidx2 = getelementptr inbounds double* %c, i64 %indvars.iv.next %tmp2 = load double* %arrayidx2, align 8 diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll index 1fd9085838df..02ea173c8032 100644 --- a/test/CodeGen/X86/avx-basic.ll +++ b/test/CodeGen/X86/avx-basic.ll @@ -51,46 +51,6 @@ entry: ret <4 x i64> %shuffle } -;;; -;;; Check that some 256-bit vectors are xformed into 128 ops -; CHECK: _A -; CHECK: vshufpd $1 -; CHECK-NEXT: vextractf128 $1 -; CHECK-NEXT: vshufpd $1 -; CHECK-NEXT: vinsertf128 $1 -define <4 x i64> @A(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6> - ret <4 x i64> %shuffle -} - -; CHECK: _B -; CHECK: vshufpd $1, %ymm -define <4 x i64> @B(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 undef, i32 undef, i32 6> - ret <4 x i64> %shuffle -} - -; CHECK: movlhps -; CHECK-NEXT: vextractf128 $1 -; CHECK-NEXT: movlhps -; CHECK-NEXT: vinsertf128 $1 -define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 undef, i32 0, i32 undef, i32 6> - ret <4 x i64> %shuffle -} - -; CHECK: vpshufd $-96 -; CHECK: vpshufd $-6 -; CHECK: vinsertf128 $1 -define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 10, i32 10, i32 11, i32 11> - ret <8 x i32> %shuffle -} - ;;; Don't crash on movd ; CHECK: _VMOVZQI2PQI ; CHECK: vmovd (% diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll deleted file mode 100644 index d2a22d709474..000000000000 --- a/test/CodeGen/X86/avx-blend.ll +++ /dev/null @@ -1,202 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; AVX128 tests: - -;CHECK-LABEL: vsel_float: -; select mask is <i1 true, i1 false, i1 true, i1 false>. -; Big endian representation is 0101 = 5. -; '1' means takes the first argument, '0' means takes the second argument. -; This is the opposite of the intel syntax, thus we expect -; the inverted mask: 1010 = 10. -; According to the ABI: -; v1 is in xmm0 => first argument is xmm0. -; v2 is in xmm1 => second argument is xmm1. -; result is in xmm0 => destination argument. 
-;CHECK: vblendps $10, %xmm1, %xmm0, %xmm0 -;CHECK: ret -define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2 - ret <4 x float> %vsel -} - - -;CHECK-LABEL: vsel_i32: -;CHECK: vblendps $10, %xmm1, %xmm0, %xmm0 -;CHECK: ret -define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2 - ret <4 x i32> %vsel -} - - -;CHECK-LABEL: vsel_double: -;CHECK: vmovsd -;CHECK: ret -define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) { - %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2 - ret <2 x double> %vsel -} - - -;CHECK-LABEL: vsel_i64: -;CHECK: vmovsd -;CHECK: ret -define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) { - %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2 - ret <2 x i64> %vsel -} - - -;CHECK-LABEL: vsel_i8: -;CHECK: vpblendvb -;CHECK: ret -define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) { - %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2 - ret <16 x i8> %vsel -} - - -; AVX256 tests: - - -;CHECK-LABEL: vsel_float8: -;CHECK-NOT: vinsertf128 -; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false> -; which translates into the boolean mask (big endian representation): -; 00010001 = 17. -; '1' means takes the first argument, '0' means takes the second argument. -; This is the opposite of the intel syntax, thus we expect -; the inverted mask: 11101110 = 238. -;CHECK: vblendps $238, %ymm1, %ymm0, %ymm0 -;CHECK: ret -define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) { - %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2 - ret <8 x float> %vsel -} - -;CHECK-LABEL: vsel_i328: -;CHECK-NOT: vinsertf128 -;CHECK: vblendps $238, %ymm1, %ymm0, %ymm0 -;CHECK-NEXT: ret -define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) { - %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2 - ret <8 x i32> %vsel -} - -;CHECK-LABEL: vsel_double8: -; select mask is 2x: 0001 => intel mask: ~0001 = 14 -; ABI: -; v1 is in ymm0 and ymm1. -; v2 is in ymm2 and ymm3. -; result is in ymm0 and ymm1. -; Compute the low part: res.low = blend v1.low, v2.low, blendmask -;CHECK: vblendpd $14, %ymm2, %ymm0, %ymm0 -; Compute the high part. 
-;CHECK: vblendpd $14, %ymm3, %ymm1, %ymm1 -;CHECK: ret -define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) { - %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2 - ret <8 x double> %vsel -} - -;CHECK-LABEL: vsel_i648: -;CHECK: vblendpd $14, %ymm2, %ymm0, %ymm0 -;CHECK: vblendpd $14, %ymm3, %ymm1, %ymm1 -;CHECK: ret -define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) { - %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2 - ret <8 x i64> %vsel -} - -;CHECK-LABEL: vsel_double4: -;CHECK-NOT: vinsertf128 -;CHECK: vblendpd $10 -;CHECK-NEXT: ret -define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2 - ret <4 x double> %vsel -} - -;; TEST blend + compares -; CHECK: testa -define <2 x double> @testa(<2 x double> %x, <2 x double> %y) { - ; CHECK: vcmplepd - ; CHECK: vblendvpd - %max_is_x = fcmp oge <2 x double> %x, %y - %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y - ret <2 x double> %max -} - -; CHECK: testb -define <2 x double> @testb(<2 x double> %x, <2 x double> %y) { - ; CHECK: vcmpnlepd - ; CHECK: vblendvpd - %min_is_x = fcmp ult <2 x double> %x, %y - %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y - ret <2 x double> %min -} - -; If we can figure out a blend has a constant mask, we should emit the -; blend instruction with an immediate mask -define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) { -; CHECK-LABEL: constant_blendvpd_avx: -; CHECK-NOT: mov -; CHECK: vblendpd -; CHECK: ret - %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab - ret <4 x double> %1 -} - -define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) { -; CHECK-LABEL: constant_blendvps_avx: -; CHECK-NOT: mov -; CHECK: vblendps -; CHECK: ret - %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd - ret <8 x float> %1 -} - -declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) -declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) - -;; 4 tests for shufflevectors that optimize to blend + immediate -; CHECK-LABEL: @blend_shufflevector_4xfloat -define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) { -; Equivalent select mask is <i1 true, i1 false, i1 true, i1 false>. -; Big endian representation is 0101 = 5. -; '1' means takes the first argument, '0' means takes the second argument. -; This is the opposite of the intel syntax, thus we expect -; Inverted mask: 1010 = 10. -; According to the ABI: -; a is in xmm0 => first argument is xmm0. -; b is in xmm1 => second argument is xmm1. -; Result is in xmm0 => destination argument. 
-; CHECK: vblendps $10, %xmm1, %xmm0, %xmm0 -; CHECK: ret - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x float> %1 -} - -; CHECK-LABEL: @blend_shufflevector_8xfloat -define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) { -; CHECK: vblendps $190, %ymm1, %ymm0, %ymm0 -; CHECK: ret - %1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 15> - ret <8 x float> %1 -} - -; CHECK-LABEL: @blend_shufflevector_4xdouble -define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) { -; CHECK: vblendpd $2, %ymm1, %ymm0, %ymm0 -; CHECK: ret - %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> - ret <4 x double> %1 -} - -; CHECK-LABEL: @blend_shufflevector_4xi64 -define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) { -; CHECK: vblendpd $13, %ymm1, %ymm0, %ymm0 -; CHECK: ret - %1 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> - ret <4 x i64> %1 -} diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll index 3e051bff768d..70ec1248cdd7 100644 --- a/test/CodeGen/X86/avx-intel-ocl.ll +++ b/test/CodeGen/X86/avx-intel-ocl.ll @@ -89,23 +89,23 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { ; WIN64: vmovaps {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload ; X64-LABEL: test_prolog_epilog -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill -; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Folded Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill +; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp) ## 32-byte Spill ; X64: call -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload -; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Folded Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: 
vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind { %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b) ret <16 x float> %c diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll new file mode 100644 index 000000000000..d2b44cd64efb --- /dev/null +++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s + +define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) { + ; CHECK: vblendpd + %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1] + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone + + +define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) { + ; CHECK: vblendps + %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1] + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone + + +define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) { + ; CHECK: vdpps + %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1] + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone + + diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index ce31161dbbcd..bb9354cff038 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -458,7 +458,7 @@ declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) { - ; CHECK: vpslldq + ; CHECK: vpslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -466,7 +466,7 @@ declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { - ; CHECK: vpslldq + ; CHECK: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -554,7 +554,7 @@ declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { - ; CHECK: vpsrldq + ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -562,7 +562,7 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { - ; CHECK: vpsrldq + ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero %res = call <2 
x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -818,18 +818,18 @@ declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK: vblendpd - %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1] + %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1] ret <2 x double> %res } -declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone +declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) { ; CHECK: vblendps - %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } -declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { @@ -850,35 +850,35 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) { ; CHECK: vdppd - %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1] + %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1] ret <2 x double> %res } -declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone +declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) { ; CHECK: vdpps - %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } -declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) { ; CHECK: vinsertps - %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } -declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK: vmpsadbw - %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1] + %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } -declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone +declare <8 x i16> 
@llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) { @@ -899,10 +899,10 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK: vpblendw - %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1] + %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } -declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone +declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) { @@ -1770,18 +1770,18 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) { ; CHECK: vblendpd - %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1] + %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1] ret <4 x double> %res } -declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone +declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) { ; CHECK: vblendps - %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1] + %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1] ret <8 x float> %res } -declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone +declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { @@ -1950,10 +1950,10 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) { ; CHECK: vdpps - %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1] + %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1] ret <8 x float> %res } -declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone +declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1) { @@ -2309,7 +2309,7 @@ declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) noun define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) { ; CHECK: vpermilpd - %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 7) ; <<2 x double>> [#uses=1] + %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1] ret <2 x double> %res } declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnone @@ -2324,7 +2324,7 @@ declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind rea define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) { - ; CHECK: vpshufd 
+ ; CHECK: vpermilps %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } diff --git a/test/CodeGen/X86/avx-movdup.ll b/test/CodeGen/X86/avx-movdup.ll deleted file mode 100644 index 42d84def98a0..000000000000 --- a/test/CodeGen/X86/avx-movdup.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; CHECK: vmovsldup -define <8 x float> @movdupA(<8 x float> %src) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <8 x float> %src, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> - ret <8 x float> %shuffle.i -} - -; CHECK: vmovshdup -define <8 x float> @movdupB(<8 x float> %src) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <8 x float> %src, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> - ret <8 x float> %shuffle.i -} - -; CHECK: vmovsldup -define <4 x i64> @movdupC(<4 x i64> %src) nounwind uwtable readnone ssp { -entry: - %0 = bitcast <4 x i64> %src to <8 x float> - %shuffle.i = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> - %1 = bitcast <8 x float> %shuffle.i to <4 x i64> - ret <4 x i64> %1 -} - -; CHECK: vmovshdup -define <4 x i64> @movdupD(<4 x i64> %src) nounwind uwtable readnone ssp { -entry: - %0 = bitcast <4 x i64> %src to <8 x float> - %shuffle.i = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> - %1 = bitcast <8 x float> %shuffle.i to <4 x i64> - ret <4 x i64> %1 -} - diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll deleted file mode 100755 index fb2287f52892..000000000000 --- a/test/CodeGen/X86/avx-sext.ll +++ /dev/null @@ -1,199 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s -check-prefix=SSSE3 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=pentium4 | FileCheck %s -check-prefix=SSE2 - -define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { -; AVX: sext_8i16_to_8i32 -; AVX: vpmovsxwd - - %B = sext <8 x i16> %A to <8 x i32> - ret <8 x i32>%B -} - -define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { -; AVX: sext_4i32_to_4i64 -; AVX: vpmovsxdq - - %B = sext <4 x i32> %A to <4 x i64> - ret <4 x i64>%B -} - -; AVX: load_sext_test1 -; AVX: vpmovsxwd (%r{{[^,]*}}), %xmm{{.*}} -; AVX: ret - -; SSSE3: load_sext_test1 -; SSSE3: movq -; SSSE3: punpcklwd %xmm{{.*}}, %xmm{{.*}} -; SSSE3: psrad $16 -; SSSE3: ret - -; SSE2: load_sext_test1 -; SSE2: movq -; SSE2: punpcklwd %xmm{{.*}}, %xmm{{.*}} -; SSE2: psrad $16 -; SSE2: ret -define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) { - %X = load <4 x i16>* %ptr - %Y = sext <4 x i16> %X to <4 x i32> - ret <4 x i32>%Y -} - -; AVX: load_sext_test2 -; AVX: vpmovsxbd (%r{{[^,]*}}), %xmm{{.*}} -; AVX: ret - -; SSSE3: load_sext_test2 -; SSSE3: movd -; SSSE3: pshufb -; SSSE3: psrad $24 -; SSSE3: ret - -; SSE2: load_sext_test2 -; SSE2: movl -; SSE2: psrad $24 -; SSE2: ret -define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) { - %X = load <4 x i8>* %ptr - %Y = sext <4 x i8> %X to <4 x i32> - ret <4 x i32>%Y -} - -; AVX: load_sext_test3 -; AVX: vpmovsxbq (%r{{[^,]*}}), %xmm{{.*}} -; AVX: ret - -; SSSE3: load_sext_test3 -; SSSE3: movsbq -; SSSE3: movsbq -; SSSE3: 
punpcklqdq -; SSSE3: ret - -; SSE2: load_sext_test3 -; SSE2: movsbq -; SSE2: movsbq -; SSE2: punpcklqdq -; SSE2: ret -define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) { - %X = load <2 x i8>* %ptr - %Y = sext <2 x i8> %X to <2 x i64> - ret <2 x i64>%Y -} - -; AVX: load_sext_test4 -; AVX: vpmovsxwq (%r{{[^,]*}}), %xmm{{.*}} -; AVX: ret - -; SSSE3: load_sext_test4 -; SSSE3: movswq -; SSSE3: movswq -; SSSE3: punpcklqdq -; SSSE3: ret - -; SSE2: load_sext_test4 -; SSE2: movswq -; SSE2: movswq -; SSE2: punpcklqdq -; SSE2: ret -define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) { - %X = load <2 x i16>* %ptr - %Y = sext <2 x i16> %X to <2 x i64> - ret <2 x i64>%Y -} - -; AVX: load_sext_test5 -; AVX: vpmovsxdq (%r{{[^,]*}}), %xmm{{.*}} -; AVX: ret - -; SSSE3: load_sext_test5 -; SSSE3: movslq -; SSSE3: movslq -; SSSE3: punpcklqdq -; SSSE3: ret - -; SSE2: load_sext_test5 -; SSE2: movslq -; SSE2: movslq -; SSE2: punpcklqdq -; SSE2: ret -define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) { - %X = load <2 x i32>* %ptr - %Y = sext <2 x i32> %X to <2 x i64> - ret <2 x i64>%Y -} - -; AVX: load_sext_test6 -; AVX: vpmovsxbw (%r{{[^,]*}}), %xmm{{.*}} -; AVX: ret - -; SSSE3: load_sext_test6 -; SSSE3: movq -; SSSE3: punpcklbw -; SSSE3: psraw $8 -; SSSE3: ret - -; SSE2: load_sext_test6 -; SSE2: movq -; SSE2: punpcklbw -; SSE2: psraw $8 -; SSE2: ret -define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) { - %X = load <8 x i8>* %ptr - %Y = sext <8 x i8> %X to <8 x i16> - ret <8 x i16>%Y -} - -; AVX: sext_4i1_to_4i64 -; AVX: vpslld $31 -; AVX: vpsrad $31 -; AVX: vpmovsxdq -; AVX: vpmovsxdq -; AVX: ret -define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { - %extmask = sext <4 x i1> %mask to <4 x i64> - ret <4 x i64> %extmask -} - -; AVX-LABEL: sext_16i8_to_16i16 -; AVX: vpmovsxbw -; AVX: vmovhlps -; AVX: vpmovsxbw -; AVX: ret -define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) { - %X = load <16 x i8>* %ptr - %Y = sext <16 x i8> %X to <16 x i16> - ret <16 x i16> %Y -} - -; AVX: sext_4i8_to_4i64 -; AVX: vpslld $24 -; AVX: vpsrad $24 -; AVX: vpmovsxdq -; AVX: vpmovsxdq -; AVX: ret -define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { - %extmask = sext <4 x i8> %mask to <4 x i64> - ret <4 x i64> %extmask -} - -; AVX: sext_4i8_to_4i64 -; AVX: vpmovsxbd -; AVX: vpmovsxdq -; AVX: vpmovsxdq -; AVX: ret -define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { - %X = load <4 x i8>* %ptr - %Y = sext <4 x i8> %X to <4 x i64> - ret <4 x i64>%Y -} - -; AVX: sext_4i16_to_4i64 -; AVX: vpmovsxwd -; AVX: vpmovsxdq -; AVX: vpmovsxdq -; AVX: ret -define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { - %X = load <4 x i16>* %ptr - %Y = sext <4 x i16> %X to <4 x i64> - ret <4 x i64>%Y -} diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll deleted file mode 100644 index 4a996d79815c..000000000000 --- a/test/CodeGen/X86/avx-shuffle.ll +++ /dev/null @@ -1,336 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; PR11102 -define <4 x float> @test1(<4 x float> %a) nounwind { - %b = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 5, i32 undef, i32 undef> - ret <4 x float> %b -; CHECK-LABEL: test1: -;; TODO: This test could be improved by removing the xor instruction and -;; having vinsertps zero out the needed elements. 
-; CHECK: vxorps -; CHECK: vinsertps -} - -; rdar://10538417 -define <3 x i64> @test2(<2 x i64> %v) nounwind readnone { -; CHECK-LABEL: test2: -; CHECK: vinsertf128 - %1 = shufflevector <2 x i64> %v, <2 x i64> %v, <3 x i32> <i32 0, i32 1, i32 undef> - %2 = shufflevector <3 x i64> zeroinitializer, <3 x i64> %1, <3 x i32> <i32 3, i32 4, i32 2> - ret <3 x i64> %2 -; CHECK: ret -} - -define <4 x i64> @test3(<4 x i64> %a, <4 x i64> %b) nounwind { - %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 undef> - ret <4 x i64> %c -; CHECK-LABEL: test3: -; CHECK: vblendpd -; CHECK: ret -} - -define <8 x float> @test4(float %a) nounwind { - %b = insertelement <8 x float> zeroinitializer, float %a, i32 0 - ret <8 x float> %b -; CHECK-LABEL: test4: -; CHECK: vinsertf128 -} - -; rdar://10594409 -define <8 x float> @test5(float* nocapture %f) nounwind uwtable readonly ssp { -entry: - %0 = bitcast float* %f to <4 x float>* - %1 = load <4 x float>* %0, align 16 -; CHECK: test5 -; CHECK: vmovaps -; CHECK-NOT: vxorps -; CHECK-NOT: vinsertf128 - %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> - ret <8 x float> %shuffle.i -} - -define <4 x double> @test6(double* nocapture %d) nounwind uwtable readonly ssp { -entry: - %0 = bitcast double* %d to <2 x double>* - %1 = load <2 x double>* %0, align 16 -; CHECK: test6 -; CHECK: vmovaps -; CHECK-NOT: vxorps -; CHECK-NOT: vinsertf128 - %shuffle.i = shufflevector <2 x double> %1, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2> - ret <4 x double> %shuffle.i -} - -define <16 x i16> @test7(<4 x i16> %a) nounwind { -; CHECK: test7 - %b = shufflevector <4 x i16> %a, <4 x i16> undef, <16 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> -; CHECK: ret - ret <16 x i16> %b -} - -; CHECK: test8 -define void @test8() { -entry: - %0 = load <16 x i64> addrspace(1)* null, align 128 - %1 = shufflevector <16 x i64> <i64 undef, i64 undef, i64 0, i64 undef, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 undef, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> %0, <16 x i32> <i32 17, i32 18, i32 2, i32 undef, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 26> - %2 = shufflevector <16 x i64> %1, <16 x i64> %0, <16 x i32> <i32 0, i32 1, i32 2, i32 30, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 undef, i32 11, i32 undef, i32 22, i32 20, i32 15> - store <16 x i64> %2, <16 x i64> addrspace(1)* undef, align 128 -; CHECK: ret - ret void -} - -; Extract a value from a shufflevector.. -define i32 @test9(<4 x i32> %a) nounwind { -; CHECK: test9 -; CHECK: vpextrd - %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 undef, i32 4> - %r = extractelement <8 x i32> %b, i32 2 -; CHECK: ret - ret i32 %r -} - -; Extract a value which is the result of an undef mask. 
-define i32 @test10(<4 x i32> %a) nounwind { -; CHECK: @test10 -; CHECK-NOT: {{^[^#]*[a-z]}} -; CHECK: ret - %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %r = extractelement <8 x i32> %b, i32 2 - ret i32 %r -} - -define <4 x float> @test11(<4 x float> %a) nounwind { -; CHECK: test11 -; CHECK: vpshufd $27 - %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> - ret <4 x float> %tmp1 -} - -define <4 x float> @test12(<4 x float>* %a) nounwind { -; CHECK: test12 -; CHECK: vpshufd - %tmp0 = load <4 x float>* %a - %tmp1 = shufflevector <4 x float> %tmp0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> - ret <4 x float> %tmp1 -} - -define <4 x i32> @test13(<4 x i32> %a) nounwind { -; CHECK: test13 -; CHECK: vpshufd $27 - %tmp1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> - ret <4 x i32> %tmp1 -} - -define <4 x i32> @test14(<4 x i32>* %a) nounwind { -; CHECK: test14 -; CHECK: vpshufd $27, ( - %tmp0 = load <4 x i32>* %a - %tmp1 = shufflevector <4 x i32> %tmp0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> - ret <4 x i32> %tmp1 -} - -; CHECK: test15 -; CHECK: vpshufd $8 -; CHECK: ret -define <4 x i32> @test15(<2 x i32>%x) nounwind readnone { - %x1 = shufflevector <2 x i32> %x, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> - ret <4 x i32>%x1 -} - -; rdar://10974078 -define <8 x float> @test16(float* nocapture %f) nounwind uwtable readonly ssp { -entry: - %0 = bitcast float* %f to <4 x float>* - %1 = load <4 x float>* %0, align 8 -; CHECK: test16 -; CHECK: vmovups -; CHECK-NOT: vxorps -; CHECK-NOT: vinsertf128 - %shuffle.i = shufflevector <4 x float> %1, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> - ret <8 x float> %shuffle.i -} - -; PR12413 -; CHECK: shuf1 -; CHECK: vpshufb -; CHECK: vpshufb -; CHECK: vpshufb -; CHECK: vpshufb -define <32 x i8> @shuf1(<32 x i8> %inval1, <32 x i8> %inval2) { -entry: - %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62> - ret <32 x i8> %0 -} - -; handle the case where only half of the 256-bits is splittable -; CHECK: shuf2 -; CHECK: vpshufb -; CHECK: vpshufb -; CHECK: vpextrb -; CHECK: vpextrb -define <32 x i8> @shuf2(<32 x i8> %inval1, <32 x i8> %inval2) { -entry: - %0 = shufflevector <32 x i8> %inval1, <32 x i8> %inval2, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 31, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62> - ret <32 x i8> %0 -} - -; CHECK: blend1 -; CHECK: vblendps -; CHECK: ret -define <4 x i32> @blend1(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline { - %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> - ret <4 x i32> %t -} - -; CHECK: blend2 -; CHECK: vblendps -; CHECK: ret -define <4 x i32> @blend2(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline { - %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x i32> %t -} - -; CHECK: blend2a -; 
CHECK: vblendps -; CHECK: ret -define <4 x float> @blend2a(<4 x float> %a, <4 x float> %b) nounwind alwaysinline { - %t = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x float> %t -} - -; CHECK: blend3 -; CHECK-NOT: vblendps -; CHECK: ret -define <4 x i32> @blend3(<4 x i32> %a, <4 x i32> %b) nounwind alwaysinline { - %t = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 2, i32 7> - ret <4 x i32> %t -} - -; CHECK: blend4 -; CHECK: vblendpd -; CHECK: ret -define <4 x i64> @blend4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline { - %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> - ret <4 x i64> %t -} - -; CHECK: narrow -; CHECK: vpermilps -; CHECK: ret -define <16 x i16> @narrow(<16 x i16> %a) nounwind alwaysinline { - %t = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 undef, i32 14, i32 15, i32 undef, i32 undef> - ret <16 x i16> %t -} - -;CHECK-LABEL: test17: -;CHECK-NOT: vinsertf128 -;CHECK: ret -define <8 x float> @test17(<4 x float> %y) { - %x = shufflevector <4 x float> %y, <4 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x float> %x -} - -; CHECK: test18 -; CHECK: vmovshdup -; CHECK: vblendps -; CHECK: ret -define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind { - %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> - ret <8 x float>%S -} - -; CHECK: test19 -; CHECK: vmovsldup -; CHECK: vblendps -; CHECK: ret -define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind { - %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> - ret <8 x float>%S -} - -; rdar://12684358 -; Make sure loads happen before stores. 
-; CHECK: swap8doubles -; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}} -; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}} -; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}} -; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}} -; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}} -; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}} -; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi) -; CHECK: vextractf128 -; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi) -; CHECK: vextractf128 -; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi) -; CHECK: vmovaps %ymm{{[0-9]+}}, {{[0-9]*}}(%rsi) -define void @swap8doubles(double* nocapture %A, double* nocapture %C) nounwind uwtable ssp { -entry: - %add.ptr = getelementptr inbounds double* %A, i64 2 - %v.i = bitcast double* %A to <2 x double>* - %0 = load <2 x double>* %v.i, align 1 - %shuffle.i.i = shufflevector <2 x double> %0, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2> - %v1.i = bitcast double* %add.ptr to <2 x double>* - %1 = load <2 x double>* %v1.i, align 1 - %2 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i, <2 x double> %1, i8 1) nounwind - %add.ptr1 = getelementptr inbounds double* %A, i64 6 - %add.ptr2 = getelementptr inbounds double* %A, i64 4 - %v.i27 = bitcast double* %add.ptr2 to <2 x double>* - %3 = load <2 x double>* %v.i27, align 1 - %shuffle.i.i28 = shufflevector <2 x double> %3, <2 x double> <double 0.000000e+00, double undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 2> - %v1.i29 = bitcast double* %add.ptr1 to <2 x double>* - %4 = load <2 x double>* %v1.i29, align 1 - %5 = tail call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %shuffle.i.i28, <2 x double> %4, i8 1) nounwind - %6 = bitcast double* %C to <4 x double>* - %7 = load <4 x double>* %6, align 32 - %add.ptr5 = getelementptr inbounds double* %C, i64 4 - %8 = bitcast double* %add.ptr5 to <4 x double>* - %9 = load <4 x double>* %8, align 32 - %shuffle.i26 = shufflevector <4 x double> %7, <4 x double> undef, <2 x i32> <i32 0, i32 1> - %10 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %7, i8 1) - %shuffle.i = shufflevector <4 x double> %9, <4 x double> undef, <2 x i32> <i32 0, i32 1> - %11 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %9, i8 1) - store <2 x double> %shuffle.i26, <2 x double>* %v.i, align 16 - store <2 x double> %10, <2 x double>* %v1.i, align 16 - store <2 x double> %shuffle.i, <2 x double>* %v.i27, align 16 - store <2 x double> %11, <2 x double>* %v1.i29, align 16 - store <4 x double> %2, <4 x double>* %6, align 32 - store <4 x double> %5, <4 x double>* %8, align 32 - ret void -} -declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone -declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone - -; this test case just should not fail -define void @test20() { - %a0 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double 0.000000e+00, i32 2 - store <3 x double> %a0, <3 x double>* undef, align 1 - %a1 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double undef, i32 2 - store <3 x double> %a1, <3 x double>* undef, align 1 - ret void -} - -define <2 x i64> @test_insert_64_zext(<2 x i64> %i) { -; CHECK-LABEL: test_insert_64_zext -; CHECK-NOT: xor -; CHECK: vmovq - %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2> - ret <2 x i64> 
%1 -} - -;; Ensure we don't use insertps from non v4x32 vectors. -;; On SSE4.1 it works because bigger vectors use more than 1 register. -;; On AVX they get passed in a single register. -;; FIXME: We could probably optimize this case, if we're only using the -;; first 4 indices. -define <4 x i32> @insert_from_diff_size(<8 x i32> %x) { -; CHECK-LABEL: insert_from_diff_size: -; CHECK-NOT: insertps -; CHECK: ret - %vecext = extractelement <8 x i32> %x, i32 0 - %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 - %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 - %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2 - %a.0 = extractelement <8 x i32> %x, i32 0 - %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a.0, i32 3 - ret <4 x i32> %vecinit3 -} diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll index b1b2f8b97a73..98c1645b9080 100644 --- a/test/CodeGen/X86/avx-splat.ll +++ b/test/CodeGen/X86/avx-splat.ll @@ -1,9 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s -; CHECK: vpunpcklbw %xmm -; CHECK-NEXT: vpunpckhbw %xmm -; CHECK-NEXT: vpshufd $85 +; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; CHECK-NEXT: vinsertf128 $1 define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp { entry: @@ -11,8 +9,7 @@ entry: ret <32 x i8> %shuffle } -; CHECK: vpunpckhwd %xmm -; CHECK-NEXT: vpshufd $85 +; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11] ; CHECK-NEXT: vinsertf128 $1 define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp { entry: @@ -21,7 +18,7 @@ entry: } ; CHECK: vmovq -; CHECK-NEXT: vmovlhps %xmm +; CHECK-NEXT: vunpcklpd %xmm ; CHECK-NEXT: vinsertf128 $1 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp { entry: @@ -32,7 +29,7 @@ entry: ret <4 x i64> %vecinit6.i } -; CHECK: vpermilpd $0 +; CHECK: vunpcklpd %xmm ; CHECK-NEXT: vinsertf128 $1 define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp { entry: @@ -72,7 +69,7 @@ __load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_ex ret <8 x float> %load_broadcast12281250 } -; CHECK: vpshufd $0 +; CHECK: vpermilps $4 ; CHECK-NEXT: vinsertf128 $1 define <8 x float> @funcF(i32 %val) nounwind { %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6 @@ -81,7 +78,7 @@ define <8 x float> @funcF(i32 %val) nounwind { ret <8 x float> %tmp } -; CHECK: vpshufd $0 +; CHECK: vpermilps $0 ; CHECK-NEXT: vinsertf128 $1 define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp { entry: @@ -90,7 +87,7 @@ entry: } ; CHECK: vextractf128 $1 -; CHECK-NEXT: vpshufd +; CHECK-NEXT: vpermilps $85 ; CHECK-NEXT: vinsertf128 $1 define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp { entry: diff --git a/test/CodeGen/X86/avx-vmovddup.ll b/test/CodeGen/X86/avx-vmovddup.ll deleted file mode 100644 index 1c56fe2b1a01..000000000000 --- a/test/CodeGen/X86/avx-vmovddup.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; CHECK: vmovddup %ymm -define <4 x i64> @A(<4 x i64> %a) { - %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> - ret <4 x i64> %c -} - -; CHECK: vmovddup (% -define <4 x i64> @B(<4 x i64>* %ptr) { - %a = load <4 x i64>* %ptr - %c = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> - ret <4 x i64> %c -} diff --git a/test/CodeGen/X86/avx-vperm2f128.ll 
b/test/CodeGen/X86/avx-vperm2f128.ll deleted file mode 100644 index c20775bacad2..000000000000 --- a/test/CodeGen/X86/avx-vperm2f128.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; CHECK: _A -; CHECK: vperm2f128 $1 -define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> - ret <8 x float> %shuffle -} - -; CHECK: _B -; CHECK: vblendps $240 -define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15> - ret <8 x float> %shuffle -} - -; CHECK: _C -; CHECK: vperm2f128 $0 -define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> - ret <8 x float> %shuffle -} - -; CHECK: _D -; CHECK: vperm2f128 $17 -define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> - ret <8 x float> %shuffle -} - -; CHECK: _E -; CHECK: vperm2f128 $17 -define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> - ret <32 x i8> %shuffle -} - -; CHECK: _E2 -; CHECK: vperm2f128 $3 -define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x i64> %shuffle -} - -;;;; Cases with undef indicies mixed in the mask - -; CHECK: _F -; CHECK: vperm2f128 $33 -define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11> - ret <8 x float> %shuffle -} - -;;;; Cases we must not select vperm2f128 - -; CHECK: _G -; CHECK-NOT: vperm2f128 -define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15> - ret <8 x float> %shuffle -} diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll new file mode 100644 index 000000000000..43303ca57c4f --- /dev/null +++ b/test/CodeGen/X86/avx-vperm2x128.ll @@ -0,0 +1,193 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 + +define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: A: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; ALL-NEXT: retq +entry: + 
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> + ret <8 x float> %shuffle +} + +define <8 x float> @B(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: B: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15> + ret <8 x float> %shuffle +} + +define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: C: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> + ret <8 x float> %shuffle +} + +define <8 x float> @D(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: D: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %shuffle +} + +define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: E: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <32 x i8> %shuffle +} + +define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: E2: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i64> %shuffle +} + +define <32 x i8> @Ei(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: Ei: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: Ei: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: retq +entry: + ; add forces execution domain + %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> + %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <32 x i8> %shuffle +} + +define <4 x i64> @E2i(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: E2i: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; AVX1-NEXT: retq +; 
+; AVX2-LABEL: E2i: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1] +; AVX2-NEXT: retq +entry: + ; add forces execution domain + %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1> + %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i64> %shuffle +} + +define <8 x i32> @E3i(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: E3i: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: E3i: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: retq +entry: + ; add forces execution domain + %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15> + ret <8 x i32> %shuffle +} + +define <16 x i16> @E4i(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: E4i: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: E4i: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +entry: + ; add forces execution domain + %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <16 x i16> %shuffle +} + +define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp { +; AVX1-LABEL: E5i: +; AVX1: ## BB#0: ## %entry +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovaps (%rsi), %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: E5i: +; AVX2: ## BB#0: ## %entry +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm1 +; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +entry: + %c = load <16 x i16>* %a + %d = load <16 x i16>* %b + %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <16 x i16> %shuffle +} + +;;;; Cases with undef indicies mixed in the mask + +define <8 x float> @F(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: F: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1,0,1] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11> + ret <8 x float> %shuffle +} + +;;;; Cases 
we must not select vperm2f128 + +define <8 x float> @G(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +; ALL-LABEL: G: +; ALL: ## BB#0: ## %entry +; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; ALL-NEXT: retq +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15> + ret <8 x float> %shuffle +} diff --git a/test/CodeGen/X86/avx-vpermil.ll b/test/CodeGen/X86/avx-vpermil.ll deleted file mode 100644 index b7f8d72e58c9..000000000000 --- a/test/CodeGen/X86/avx-vpermil.ll +++ /dev/null @@ -1,54 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; CHECK: vpermilps -define <8 x float> @funcA(<8 x float> %a) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7, i32 5> - ret <8 x float> %shuffle -} - -; CHECK: vpermilpd -define <4 x double> @funcB(<4 x double> %a) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3> - ret <4 x double> %shuffle -} - -; CHECK: vpermilps -define <8 x i32> @funcC(<8 x i32> %a) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7, i32 5> - ret <8 x i32> %shuffle -} - -; CHECK: vpermilpd -define <4 x i64> @funcD(<4 x i64> %a) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3> - ret <4 x i64> %shuffle -} - -; CHECK: vpermilpd -define <4 x i64> @funcQ(<4 x i64>* %a) nounwind uwtable readnone ssp { -entry: - %a2 = load <4 x i64>* %a - %shuffle = shufflevector <4 x i64> %a2, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3> - ret <4 x i64> %shuffle -} - -; vpermil should match masks like this: <u,3,1,2,4,u,5,6>. Check that the -; target specific mask was correctly generated. 
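; As a worked example (VPERMILPS ymm imm8 encoding: one 2-bit selector per
; destination element, applied identically to both 128-bit lanes; undef
; elements leave their selector free):
;   the lane-relative selectors for <u,3,1,2,4,u,5,6> resolve to <0,3,1,2>, so
;   imm8 = (2<<6) | (1<<4) | (3<<2) | 0 = 156 = 0x9c, printed as $-100 (signed byte).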
-; CHECK: vpermilps $-100 -define <8 x float> @funcE(<8 x float> %a) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 8, i32 3, i32 1, i32 2, i32 4, i32 8, i32 5, i32 6> - ret <8 x float> %shuffle -} - -; CHECK: palignr $8 -; CHECK: palignr $8 -define <8 x float> @funcF(<8 x float> %a) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> - ret <8 x float> %shuffle -} diff --git a/test/CodeGen/X86/avx-vshufp.ll b/test/CodeGen/X86/avx-vshufp.ll deleted file mode 100644 index ad3dbc1ed893..000000000000 --- a/test/CodeGen/X86/avx-vshufp.ll +++ /dev/null @@ -1,157 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -; CHECK: vshufps $-53, %ymm -define <8 x float> @A(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15> - ret <8 x float> %shuffle -} - -; CHECK: vshufps $-53, (%{{.*}}), %ymm -define <8 x float> @A2(<8 x float>* %a, <8 x float>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <8 x float>* %a - %b2 = load <8 x float>* %b - %shuffle = shufflevector <8 x float> %a2, <8 x float> %b2, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15> - ret <8 x float> %shuffle -} - -; CHECK: vshufps $-53, %ymm -define <8 x i32> @A3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15> - ret <8 x i32> %shuffle -} - -; CHECK: vshufps $-53, (%{{.*}}), %ymm -define <8 x i32> @A4(<8 x i32>* %a, <8 x i32>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <8 x i32>* %a - %b2 = load <8 x i32>* %b - %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b2, <8 x i32> <i32 3, i32 2, i32 8, i32 11, i32 7, i32 6, i32 12, i32 15> - ret <8 x i32> %shuffle -} - -; CHECK: vblendpd $10, %ymm -define <4 x double> @B(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x double> %shuffle -} - -; CHECK: vblendpd $10, (%{{.*}}), %ymm -define <4 x double> @B2(<4 x double>* %a, <4 x double>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <4 x double>* %a - %b2 = load <4 x double>* %b - %shuffle = shufflevector <4 x double> %a2, <4 x double> %b2, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x double> %shuffle -} - -; CHECK: vblendpd $10, %ymm -define <4 x i64> @B3(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x i64> %shuffle -} - -; CHECK: vblendpd $10, (%{{.*}}), %ymm -define <4 x i64> @B4(<4 x i64>* %a, <4 x i64>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <4 x i64>* %a - %b2 = load <4 x i64>* %b - %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b2, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x i64> %shuffle -} - -; CHECK: vshufps $-53, %ymm -define <8 x float> @C(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 undef, i32 undef, i32 11, i32 undef, i32 6, i32 12, i32 undef> - ret <8 x 
float> %shuffle -} - -; CHECK: vblendpd $2, %ymm -define <4 x double> @D(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 undef> - ret <4 x double> %shuffle -} - -; CHECK: vshufps $-55, %ymm -define <8 x float> @E(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 10, i32 0, i32 3, i32 13, i32 14, i32 4, i32 7> - ret <8 x float> %shuffle -} - -; CHECK: vshufpd $8, %ymm -define <4 x double> @F(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 7> - ret <4 x double> %shuffle -} - -; CHECK: vshufps $-53, %xmm -define <4 x float> @A128(<4 x float> %a, <4 x float> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 4, i32 7> - ret <4 x float> %shuffle -} - -; CHECK: vshufps $-53, (%{{.*}}), %xmm -define <4 x float> @A2128(<4 x float>* %a, <4 x float>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <4 x float>* %a - %b2 = load <4 x float>* %b - %shuffle = shufflevector <4 x float> %a2, <4 x float> %b2, <4 x i32> <i32 3, i32 2, i32 4, i32 7> - ret <4 x float> %shuffle -} - -; CHECK: vshufps $-53, %xmm -define <4 x i32> @A3128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 4, i32 7> - ret <4 x i32> %shuffle -} - -; CHECK: vshufps $-53, (%{{.*}}), %xmm -define <4 x i32> @A4128(<4 x i32>* %a, <4 x i32>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <4 x i32>* %a - %b2 = load <4 x i32>* %b - %shuffle = shufflevector <4 x i32> %a2, <4 x i32> %b2, <4 x i32> <i32 3, i32 2, i32 4, i32 7> - ret <4 x i32> %shuffle -} - -; CHECK: vshufpd $1, %xmm -define <2 x double> @B128(<2 x double> %a, <2 x double> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 2> - ret <2 x double> %shuffle -} - -; CHECK: vshufpd $1, (%{{.*}}), %xmm -define <2 x double> @B2128(<2 x double>* %a, <2 x double>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <2 x double>* %a - %b2 = load <2 x double>* %b - %shuffle = shufflevector <2 x double> %a2, <2 x double> %b2, <2 x i32> <i32 1, i32 2> - ret <2 x double> %shuffle -} - -; CHECK: vshufpd $1, %xmm -define <2 x i64> @B3128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp { -entry: - %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2> - ret <2 x i64> %shuffle -} - -; CHECK: vshufpd $1, (%{{.*}}), %xmm -define <2 x i64> @B4128(<2 x i64>* %a, <2 x i64>* %b) nounwind uwtable readnone ssp { -entry: - %a2 = load <2 x i64>* %a - %b2 = load <2 x i64>* %b - %shuffle = shufflevector <2 x i64> %a2, <2 x i64> %b2, <2 x i32> <i32 1, i32 2> - ret <2 x i64> %shuffle -} diff --git a/test/CodeGen/X86/avx-zext.ll b/test/CodeGen/X86/avx-zext.ll deleted file mode 100755 index 75117463bc39..000000000000 --- a/test/CodeGen/X86/avx-zext.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s - -define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { -;CHECK-LABEL: zext_8i16_to_8i32: -;CHECK: vpunpckhwd -;CHECK: ret - - %B = zext <8 x i16> %A to <8 x i32> - 
ret <8 x i32>%B -} - -define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { -;CHECK-LABEL: zext_4i32_to_4i64: -;CHECK: vpunpckhdq -;CHECK: ret - - %B = zext <4 x i32> %A to <4 x i64> - ret <4 x i64>%B -} - -define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) { -;CHECK-LABEL: zext_8i8_to_8i32: -;CHECK: vpunpckhwd -;CHECK: vpmovzxwd -;CHECK: vinsertf128 -;CHECK: ret - %t = zext <8 x i8> %z to <8 x i32> - ret <8 x i32> %t -} - -; PR17654 -define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) { -; CHECK-LABEL: zext_16i8_to_16i16: -; CHECK: vpxor -; CHECK: vpunpckhbw -; CHECK: vpunpcklbw -; CHECK: vinsertf128 -; CHECK: ret - %t = zext <16 x i8> %z to <16 x i16> - ret <16 x i16> %t -} diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll index 6069c14f0d80..cba6d98f5a84 100644 --- a/test/CodeGen/X86/avx.ll +++ b/test/CodeGen/X86/avx.ll @@ -60,7 +60,7 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa ; X32: movl 8(%esp), %ecx ; CHECK-NOT: mov ;; Try to match a bit more of the instr, since we need the load's offset. -; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), % +; CHECK: vinsertps $-64, 12(%{{...}},%{{...}}), % ; CHECK-NEXT: ret %1 = getelementptr inbounds <4 x float>* %pb, i64 %index %2 = load <4 x float>* %1, align 16 diff --git a/test/CodeGen/X86/avx1-stack-reload-folding.ll b/test/CodeGen/X86/avx1-stack-reload-folding.ll new file mode 100644 index 000000000000..54c192583d6e --- /dev/null +++ b/test/CodeGen/X86/avx1-stack-reload-folding.ll @@ -0,0 +1,83 @@ +; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; Stack reload folding tests - we use the 'big vectors' pattern to guarantee spilling to stack. +; +; Many of these tests are primarily to check memory folding with specific instructions. Using a basic +; load/cvt/store pattern to test for this would mean that it wouldn't be the memory folding code that's +; being tested - the load-execute version of the instruction from the tables would be matched instead. 
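; For contrast, a minimal illustrative sketch (not part of this patch) of the
; basic load/cvt/store pattern warned about above: the load feeding the
; conversion is simply matched as the load-execute form of the instruction
; (vcvtdq2ps with a memory operand), so no spill/reload is generated and the
; memory-folding tables are never exercised.
define void @naive_cvtdq2ps(<8 x i32>* %a, <8 x float>* %c) {
  %1 = load <8 x i32>* %a
  %2 = sitofp <8 x i32> %1 to <8 x float>
  store <8 x float> %2, <8 x float>* %c
  ret void
}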
+ +define void @stack_fold_vmulpd(<64 x double>* %a, <64 x double>* %b, <64 x double>* %c) { + ;CHECK-LABEL: stack_fold_vmulpd + ;CHECK: vmulpd {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + + %1 = load <64 x double>* %a + %2 = load <64 x double>* %b + %3 = fadd <64 x double> %1, %2 + %4 = fsub <64 x double> %1, %2 + %5 = fmul <64 x double> %3, %4 + store <64 x double> %5, <64 x double>* %c + ret void +} + +define void @stack_fold_cvtdq2ps(<128 x i32>* %a, <128 x i32>* %b, <128 x float>* %c) { + ;CHECK-LABEL: stack_fold_cvtdq2ps + ;CHECK: vcvtdq2ps {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + + %1 = load <128 x i32>* %a + %2 = load <128 x i32>* %b + %3 = and <128 x i32> %1, %2 + %4 = xor <128 x i32> %1, %2 + %5 = sitofp <128 x i32> %3 to <128 x float> + %6 = sitofp <128 x i32> %4 to <128 x float> + %7 = fadd <128 x float> %5, %6 + store <128 x float> %7, <128 x float>* %c + ret void +} + +define void @stack_fold_cvtpd2ps(<128 x double>* %a, <128 x double>* %b, <128 x float>* %c) { + ;CHECK-LABEL: stack_fold_cvtpd2ps + ;CHECK: vcvtpd2psy {{[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + + %1 = load <128 x double>* %a + %2 = load <128 x double>* %b + %3 = fadd <128 x double> %1, %2 + %4 = fsub <128 x double> %1, %2 + %5 = fptrunc <128 x double> %3 to <128 x float> + %6 = fptrunc <128 x double> %4 to <128 x float> + %7 = fadd <128 x float> %5, %6 + store <128 x float> %7, <128 x float>* %c + ret void +} + +define void @stack_fold_cvttpd2dq(<64 x double>* %a, <64 x double>* %b, <64 x i32>* %c) #0 { + ;CHECK-LABEL: stack_fold_cvttpd2dq + ;CHECK: vcvttpd2dqy {{[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + + %1 = load <64 x double>* %a + %2 = load <64 x double>* %b + %3 = fadd <64 x double> %1, %2 + %4 = fsub <64 x double> %1, %2 + %5 = fptosi <64 x double> %3 to <64 x i32> + %6 = fptosi <64 x double> %4 to <64 x i32> + %7 = or <64 x i32> %5, %6 + store <64 x i32> %7, <64 x i32>* %c + ret void +} + +define void @stack_fold_cvttps2dq(<128 x float>* %a, <128 x float>* %b, <128 x i32>* %c) #0 { + ;CHECK-LABEL: stack_fold_cvttps2dq + ;CHECK: vcvttps2dq {{[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + + %1 = load <128 x float>* %a + %2 = load <128 x float>* %b + %3 = fadd <128 x float> %1, %2 + %4 = fsub <128 x float> %1, %2 + %5 = fptosi <128 x float> %3 to <128 x i32> + %6 = fptosi <128 x float> %4 to <128 x i32> + %7 = or <128 x i32> %5, %6 + store <128 x i32> %7, <128 x i32>* %c + ret void +} diff --git a/test/CodeGen/X86/avx2-blend.ll b/test/CodeGen/X86/avx2-blend.ll deleted file mode 100644 index b02442b6fadd..000000000000 --- a/test/CodeGen/X86/avx2-blend.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s - -define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) { -; CHECK-LABEL: constant_pblendvb_avx2: -; CHECK: vmovdqa -; CHECK: vpblendvb - %1 = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd - ret <32 x i8> %1 -} - -declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll 
b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll new file mode 100644 index 000000000000..ac2c73bb9321 --- /dev/null +++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll @@ -0,0 +1,33 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=core-avx2 -mattr=avx2 | FileCheck %s + +define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) { + ; CHECK: vpblendw + %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind readnone + + +define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) { + ; CHECK: vpblendd + %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind readnone + + +define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) { + ; CHECK: vpblendd + %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone + + +define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) { + ; CHECK: vmpsadbw + %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone + diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll index ab3d591e1d9f..79a3361bfe86 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -161,7 +161,7 @@ declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) { - ; CHECK: vpslldq + ; CHECK: vpslldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -169,7 +169,7 @@ declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) { - ; CHECK: vpslldq + ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24] %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -257,7 +257,7 @@ declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) { - ; CHECK: vpsrldq + ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] ret <4 x i64> %res } @@ -265,7 +265,7 @@ declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) { - ; CHECK: vpsrldq + ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] ret <4 x i64> 
%res } @@ -475,10 +475,10 @@ declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) { ; CHECK: vmpsadbw - %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1] + %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } -declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone +declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) { @@ -499,10 +499,10 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounw define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK: vpblendw - %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1] + %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7) ; <<16 x i16>> [#uses=1] ret <16 x i16> %res } -declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind readnone +declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) { @@ -706,18 +706,18 @@ declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind re define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK: vpblendd - %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1] + %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1] ret <4 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind readnone +declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK: vpblendd - %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1] + %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1] ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone +declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) { diff --git a/test/CodeGen/X86/avx2-nontemporal.ll b/test/CodeGen/X86/avx2-nontemporal.ll index 0768aae48e8c..4d28a979712a 100644 --- a/test/CodeGen/X86/avx2-nontemporal.ll +++ b/test/CodeGen/X86/avx2-nontemporal.ll @@ -19,4 +19,4 @@ define void @f(<8 x float> %A, i8* %B, <4 x double> %C, i32 %D, <4 x i64> %E) { ret void } -!0 = metadata !{i32 1} +!0 = !{i32 1} diff --git a/test/CodeGen/X86/avx2-palignr.ll b/test/CodeGen/X86/avx2-palignr.ll deleted file mode 100644 index 83573dc7b260..000000000000 --- a/test/CodeGen/X86/avx2-palignr.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s - -define <8 x i32> @test1(<8 x i32> %A, <8 x i32> %B) nounwind { -; CHECK-LABEL: test1: -; CHECK: vpalignr $4 - %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12> - ret <8 x i32> %C -} - -define <8 x i32> @test2(<8 x i32> %A, <8 x i32> %B) nounwind { -; 
CHECK-LABEL: test2: -; CHECK: vpalignr $4 - %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 undef, i32 12> - ret <8 x i32> %C -} - -define <8 x i32> @test3(<8 x i32> %A, <8 x i32> %B) nounwind { -; CHECK-LABEL: test3: -; CHECK: vpalignr $4 - %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 1, i32 undef, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12> - ret <8 x i32> %C -} -; -define <8 x i32> @test4(<8 x i32> %A, <8 x i32> %B) nounwind { -; CHECK-LABEL: test4: -; CHECK: vpalignr $8 - %C = shufflevector <8 x i32> %A, <8 x i32> %B, <8 x i32> <i32 10, i32 11, i32 undef, i32 1, i32 14, i32 15, i32 4, i32 5> - ret <8 x i32> %C -} - -define <16 x i16> @test5(<16 x i16> %A, <16 x i16> %B) nounwind { -; CHECK-LABEL: test5: -; CHECK: vpalignr $6 - %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 3, i32 4, i32 undef, i32 6, i32 7, i32 16, i32 17, i32 18, i32 11, i32 12, i32 13, i32 undef, i32 15, i32 24, i32 25, i32 26> - ret <16 x i16> %C -} - -define <16 x i16> @test6(<16 x i16> %A, <16 x i16> %B) nounwind { -; CHECK-LABEL: test6: -; CHECK: vpalignr $6 - %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 11, i32 12, i32 13, i32 undef, i32 15, i32 24, i32 25, i32 26> - ret <16 x i16> %C -} - -define <16 x i16> @test7(<16 x i16> %A, <16 x i16> %B) nounwind { -; CHECK-LABEL: test7: -; CHECK: vpalignr $6 - %C = shufflevector <16 x i16> %A, <16 x i16> %B, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <16 x i16> %C -} - -define <32 x i8> @test8(<32 x i8> %A, <32 x i8> %B) nounwind { -; CHECK-LABEL: test8: -; CHECK: vpalignr $5 - %C = shufflevector <32 x i8> %A, <32 x i8> %B, <32 x i32> <i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52> - ret <32 x i8> %C -} diff --git a/test/CodeGen/X86/avx2-pmovx-256-old-shuffle.ll b/test/CodeGen/X86/avx2-pmovx-256-old-shuffle.ll new file mode 100644 index 000000000000..44eb42adb9f8 --- /dev/null +++ b/test/CodeGen/X86/avx2-pmovx-256-old-shuffle.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -x86-experimental-vector-shuffle-lowering=false -mattr=+avx2 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-darwin" + +; PR21876 +; The old shuffle lowering sometimes generates VZEXT nodes with both input +; and output same-sized types, here 256-bits. For instance, a v8i8 to v8i32 +; zero-extend would become a (v8i32 (VZEXT v32i8)) node, which can't happen +; otherwise. The companion commit r223996 added those patterns temporarily. +; This test, along with the VR256 for AVX2 PMOVXrr instructions, should be +; removed once the old vector shuffle lowering goes away. + +define void @test_avx2_pmovx_256(<8 x i8>* %tmp64, <8 x float>* %tmp75) { +; CHECK-LABEL: test_avx2_pmovx_256 +; We really don't care about the generated code. 
+; CHECK: vpmovzxbd +; CHECK: vpbroadcastd +; CHECK: vpand +; CHECK: vcvtdq2ps +; CHECK: vmovups +; CHECK: vzeroupper +; CHECK: retq + + %wide.load458 = load <8 x i8>* %tmp64, align 1 + %tmp68 = uitofp <8 x i8> %wide.load458 to <8 x float> + store <8 x float> %tmp68, <8 x float>* %tmp75, align 4 + ret void +} diff --git a/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll b/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll new file mode 100644 index 000000000000..7301b7cbfc4e --- /dev/null +++ b/test/CodeGen/X86/avx2-pmovxrm-intrinsics.ll @@ -0,0 +1,110 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 | FileCheck %s + +define <16 x i16> @test_lvm_x86_avx2_pmovsxbw(<16 x i8>* %a) { +; CHECK-LABEL: test_lvm_x86_avx2_pmovsxbw +; CHECK: vpmovsxbw (%rdi), %ymm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %1) + ret <16 x i16> %2 +} + +define <8 x i32> @test_llvm_x86_avx2_pmovsxbd(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbd +; CHECK: vpmovsxbd (%rdi), %ymm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %1) + ret <8 x i32> %2 +} + +define <4 x i64> @test_llvm_x86_avx2_pmovsxbq(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovsxbq +; CHECK: vpmovsxbq (%rdi), %ymm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %1) + ret <4 x i64> %2 +} + +define <8 x i32> @test_llvm_x86_avx2_pmovsxwd(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovsxwd +; CHECK: vpmovsxwd (%rdi), %ymm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %1) + ret <8 x i32> %2 +} + +define <4 x i64> @test_llvm_x86_avx2_pmovsxwq(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovsxwq +; CHECK: vpmovsxwq (%rdi), %ymm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %1) + ret <4 x i64> %2 +} + +define <4 x i64> @test_llvm_x86_avx2_pmovsxdq(<4 x i32>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovsxdq +; CHECK: vpmovsxdq (%rdi), %ymm0 + %1 = load <4 x i32>* %a, align 1 + %2 = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %1) + ret <4 x i64> %2 +} + +define <16 x i16> @test_lvm_x86_avx2_pmovzxbw(<16 x i8>* %a) { +; CHECK-LABEL: test_lvm_x86_avx2_pmovzxbw +; CHECK: vpmovzxbw (%rdi), %ymm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %1) + ret <16 x i16> %2 +} + +define <8 x i32> @test_llvm_x86_avx2_pmovzxbd(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovzxbd +; CHECK: vpmovzxbd (%rdi), %ymm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %1) + ret <8 x i32> %2 +} + +define <4 x i64> @test_llvm_x86_avx2_pmovzxbq(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovzxbq +; CHECK: vpmovzxbq (%rdi), %ymm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %1) + ret <4 x i64> %2 +} + +define <8 x i32> @test_llvm_x86_avx2_pmovzxwd(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovzxwd +; CHECK: vpmovzxwd (%rdi), %ymm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %1) + ret <8 x i32> %2 +} + +define <4 x i64> @test_llvm_x86_avx2_pmovzxwq(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovzxwq +; CHECK: vpmovzxwq (%rdi), %ymm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %1) + ret <4 x i64> %2 +} + +define <4 x i64> 
@test_llvm_x86_avx2_pmovzxdq(<4 x i32>* %a) { +; CHECK-LABEL: test_llvm_x86_avx2_pmovzxdq +; CHECK: vpmovzxdq (%rdi), %ymm0 + %1 = load <4 x i32>* %a, align 1 + %2 = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %1) + ret <4 x i64> %2 +} + +declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) +declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) +declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) +declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) +declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) +declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) +declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) +declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) +declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) +declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) +declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) +declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll deleted file mode 100644 index 185b989458ae..000000000000 --- a/test/CodeGen/X86/avx2-shuffle.ll +++ /dev/null @@ -1,127 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s - -; Make sure that we don't match this shuffle using the vpblendw YMM instruction. -; The mask for the vpblendw instruction needs to be identical for both halves -; of the YMM. Need to use two vpblendw instructions. - -; CHECK: vpblendw_test1 -; mask = 10010110,b = 150,d -; CHECK: vpblendw $150, %ymm -; CHECK: ret -define <16 x i16> @vpblendw_test1(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline { - %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 23, - i32 8, i32 25, i32 26, i32 11, i32 28, i32 13, i32 14, i32 31> - ret <16 x i16> %t -} - -; CHECK: vpblendw_test2 -; mask1 = 00010110 = 22 -; mask2 = 10000000 = 128 -; CHECK: vpblendw $128, %xmm -; CHECK: vpblendw $22, %xmm -; CHECK: vinserti128 -; CHECK: ret -define <16 x i16> @vpblendw_test2(<16 x i16> %a, <16 x i16> %b) nounwind alwaysinline { - %t = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 20, i32 5, i32 6, i32 7, - i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31> - ret <16 x i16> %t -} - -; CHECK: blend_test1 -; CHECK: vpblendd -; CHECK: ret -define <8 x i32> @blend_test1(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline { - %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7> - ret <8 x i32> %t -} - -; CHECK: blend_test2 -; CHECK: vpblendd -; CHECK: ret -define <8 x i32> @blend_test2(<8 x i32> %a, <8 x i32> %b) nounwind alwaysinline { - %t = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7> - ret <8 x i32> %t -} - - -; CHECK: blend_test3 -; CHECK: vblendps -; CHECK: ret -define <8 x float> @blend_test3(<8 x float> %a, <8 x float> %b) nounwind alwaysinline { - %t = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 12, i32 5, i32 6, i32 7> - ret <8 x float> %t -} - -; CHECK: blend_test4 -; CHECK: vblendpd -; CHECK: ret -define <4 x i64> @blend_test4(<4 x i64> %a, <4 x i64> %b) nounwind alwaysinline { - %t = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3> - ret <4 x i64> %t -} - -;; 2 tests for shufflevectors that optimize to blend + immediate -; CHECK-LABEL: @blend_test5 -; CHECK: vpblendd $10, %xmm1, %xmm0, %xmm0 -; CHECK: ret -define <4 x 
i32> @blend_test5(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x i32> %1 -} - -; CHECK-LABEL: @blend_test6 -; CHECK: vpblendw $134, %ymm1, %ymm0, %ymm0 -; CHECK: ret -define <16 x i16> @blend_test6(<16 x i16> %a, <16 x i16> %b) { - %1 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 23, - i32 8, i32 25, i32 26, i32 11, i32 12, i32 13, i32 14, i32 31> - ret <16 x i16> %1 -} - -; CHECK: vpshufhw $27, %ymm -define <16 x i16> @vpshufhw(<16 x i16> %src1) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12> - ret <16 x i16> %shuffle.i -} - -; CHECK: vpshuflw $27, %ymm -define <16 x i16> @vpshuflw(<16 x i16> %src1) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15> - ret <16 x i16> %shuffle.i -} - -; CHECK: vpshufb_test -; CHECK: vpshufb {{.*\(%r.*}}, %ymm -; CHECK: ret -define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind { - %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, - i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, - i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, - i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18> - ret <32 x i8>%S -} - -; CHECK: vpshufb1_test -; CHECK: vpshufb {{.*\(%r.*}}, %ymm -; CHECK: ret -define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind { - %S = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, - i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15, - i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, - i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18> - ret <32 x i8>%S -} - - -; CHECK: vpshufb2_test -; CHECK: vpshufb {{.*\(%r.*}}, %ymm -; CHECK: ret -define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind { - %S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, - i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15, - i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, - i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18> - ret <32 x i8>%S -} diff --git a/test/CodeGen/X86/avx2-unpack.ll b/test/CodeGen/X86/avx2-unpack.ll deleted file mode 100644 index 6d17443489ae..000000000000 --- a/test/CodeGen/X86/avx2-unpack.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s - -; CHECK: vpunpckhdq -define <8 x i32> @unpackhidq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> - ret <8 x i32> %shuffle.i -} - -; CHECK: vpunpckhqdq -define <4 x i64> @unpackhiqdq1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> - ret <4 x i64> %shuffle.i -} - -; CHECK: vpunpckldq -define <8 x i32> 
@unpacklodq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> - ret <8 x i32> %shuffle.i -} - -; CHECK: vpunpcklqdq -define <4 x i64> @unpacklqdq1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> - ret <4 x i64> %shuffle.i -} - -; CHECK: vpunpckhwd -define <16 x i16> @unpackhwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> - ret <16 x i16> %shuffle.i -} - -; CHECK: vpunpcklwd -define <16 x i16> @unpacklwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27> - ret <16 x i16> %shuffle.i -} - -; CHECK: vpunpckhbw -define <32 x i8> @unpackhbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> - ret <32 x i8> %shuffle.i -} - -; CHECK: vpunpcklbw -define <32 x i8> @unpacklbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> - ret <32 x i8> %shuffle.i -} - -; CHECK: vpunpckhdq -define <8 x i32> @unpackhidq1_undef(<8 x i32> %src1) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> - ret <8 x i32> %shuffle.i -} - -; CHECK: vpunpckhqdq -define <4 x i64> @unpackhiqdq1_undef(<4 x i64> %src1) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src1, <4 x i32> <i32 1, i32 5, i32 3, i32 7> - ret <4 x i64> %shuffle.i -} - -; CHECK: vpunpckhwd -define <16 x i16> @unpackhwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> - ret <16 x i16> %shuffle.i -} - -; CHECK: vpunpcklwd -define <16 x i16> @unpacklwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp { -entry: - %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27> - ret <16 x i16> %shuffle.i -} - diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll index 66f586d23d14..924c06eba768 100644 --- 
a/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/test/CodeGen/X86/avx2-vbroadcast.ll @@ -317,7 +317,7 @@ define <4 x double> @_inreg4xdouble(<4 x double> %a) { } ;CHECK-LABEL: _inreg2xdouble: -;CHECK: vpbroadcastq +;CHECK: vunpcklpd ;CHECK: ret define <2 x double> @_inreg2xdouble(<2 x double> %a) { %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer diff --git a/test/CodeGen/X86/avx2-vperm2i128.ll b/test/CodeGen/X86/avx2-vperm2i128.ll deleted file mode 100644 index 1937db5d7c16..000000000000 --- a/test/CodeGen/X86/avx2-vperm2i128.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s - -; CHECK: vperm2i128 $17 -define <32 x i8> @E(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp { -entry: - ; add forces execution domain - %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> - %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> - ret <32 x i8> %shuffle -} - -; CHECK: vperm2i128 $3 -define <4 x i64> @E2(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { -entry: - ; add forces execution domain - %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1> - %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x i64> %shuffle -} - -; CHECK: vperm2i128 $49 -define <8 x i32> @E3(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { -entry: - ; add forces execution domain - %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> - %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15> - ret <8 x i32> %shuffle -} - -; CHECK: vperm2i128 $2 -define <16 x i16> @E4(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp { -entry: - ; add forces execution domain - %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> - %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - ret <16 x i16> %shuffle -} - -; CHECK: vperm2i128 $2, (% -define <16 x i16> @E5(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp { -entry: - %c = load <16 x i16>* %a - %d = load <16 x i16>* %b - %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> - %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - ret <16 x i16> %shuffle -} diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll index 4d1c9f7cd973..94b08215b896 100644 --- a/test/CodeGen/X86/avx512-arith.ll +++ b/test/CodeGen/X86/avx512-arith.ll @@ -1,189 +1,217 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -; 
CHECK-LABEL: addpd512 -; CHECK: vaddpd -; CHECK: ret define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) { +; CHECK-LABEL: addpd512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %add.i = fadd <8 x double> %x, %y ret <8 x double> %add.i } -; CHECK-LABEL: addpd512fold -; CHECK: vaddpd LCP{{.*}}(%rip) -; CHECK: ret define <8 x double> @addpd512fold(<8 x double> %y) { +; CHECK-LABEL: addpd512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %add.i = fadd <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.800000e+00, double 2.300000e+00, double 1.200000e+00> ret <8 x double> %add.i } -; CHECK-LABEL: addps512 -; CHECK: vaddps -; CHECK: ret define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) { +; CHECK-LABEL: addps512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %add.i = fadd <16 x float> %x, %y ret <16 x float> %add.i } -; CHECK-LABEL: addps512fold -; CHECK: vaddps LCP{{.*}}(%rip) -; CHECK: ret define <16 x float> @addps512fold(<16 x float> %y) { +; CHECK-LABEL: addps512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %add.i = fadd <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 4.500000e+00, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000> ret <16 x float> %add.i } -; CHECK-LABEL: subpd512 -; CHECK: vsubpd -; CHECK: ret define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) { +; CHECK-LABEL: subpd512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vsubpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %sub.i = fsub <8 x double> %x, %y ret <8 x double> %sub.i } -; CHECK-LABEL: @subpd512fold -; CHECK: vsubpd (% -; CHECK: ret define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) { +; CHECK-LABEL: subpd512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vsubpd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %tmp2 = load <8 x double>* %x, align 8 %sub.i = fsub <8 x double> %y, %tmp2 ret <8 x double> %sub.i } -; CHECK-LABEL: @subps512 -; CHECK: vsubps -; CHECK: ret define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) { +; CHECK-LABEL: subps512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vsubps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %sub.i = fsub <16 x float> %x, %y ret <16 x float> %sub.i } -; CHECK-LABEL: subps512fold -; CHECK: vsubps (% -; CHECK: ret define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) { +; CHECK-LABEL: subps512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vsubps (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %tmp2 = load <16 x float>* %x, align 4 %sub.i = fsub <16 x float> %y, %tmp2 ret <16 x float> %sub.i } -; CHECK-LABEL: imulq512 -; CHECK: vpmuludq -; CHECK: vpmuludq -; CHECK: ret define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) { +; CHECK-LABEL: imulq512: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 +; CHECK-NEXT: vpsrlq $32, %zmm0, %zmm3 +; CHECK-NEXT: vpmuludq %zmm3, %zmm1, %zmm3 +; CHECK-NEXT: vpsllq $32, %zmm3, %zmm3 +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 +; CHECK-NEXT: 
vpsrlq $32, %zmm1, %zmm1 +; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vpsllq $32, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq %z = mul <8 x i64>%x, %y ret <8 x i64>%z } -; CHECK-LABEL: mulpd512 -; CHECK: vmulpd -; CHECK: ret define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) { +; CHECK-LABEL: mulpd512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmulpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %mul.i = fmul <8 x double> %x, %y ret <8 x double> %mul.i } -; CHECK-LABEL: mulpd512fold -; CHECK: vmulpd LCP{{.*}}(%rip) -; CHECK: ret define <8 x double> @mulpd512fold(<8 x double> %y) { +; CHECK-LABEL: mulpd512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmulpd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %mul.i = fmul <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00> ret <8 x double> %mul.i } -; CHECK-LABEL: mulps512 -; CHECK: vmulps -; CHECK: ret define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) { +; CHECK-LABEL: mulps512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmulps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %mul.i = fmul <16 x float> %x, %y ret <16 x float> %mul.i } -; CHECK-LABEL: mulps512fold -; CHECK: vmulps LCP{{.*}}(%rip) -; CHECK: ret define <16 x float> @mulps512fold(<16 x float> %y) { +; CHECK-LABEL: mulps512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %mul.i = fmul <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000> ret <16 x float> %mul.i } -; CHECK-LABEL: divpd512 -; CHECK: vdivpd -; CHECK: ret define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) { +; CHECK-LABEL: divpd512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vdivpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %div.i = fdiv <8 x double> %x, %y ret <8 x double> %div.i } -; CHECK-LABEL: divpd512fold -; CHECK: vdivpd LCP{{.*}}(%rip) -; CHECK: ret define <8 x double> @divpd512fold(<8 x double> %y) { +; CHECK-LABEL: divpd512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vdivpd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %div.i = fdiv <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00> ret <8 x double> %div.i } -; CHECK-LABEL: divps512 -; CHECK: vdivps -; CHECK: ret define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) { +; CHECK-LABEL: divps512: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vdivps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq entry: %div.i = fdiv <16 x float> %x, %y ret <16 x float> %div.i } -; CHECK-LABEL: divps512fold -; CHECK: vdivps LCP{{.*}}(%rip) -; CHECK: ret define <16 x float> @divps512fold(<16 x float> %y) { +; CHECK-LABEL: divps512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vdivps {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %div.i = fdiv <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 
4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000> ret <16 x float> %div.i } -; CHECK-LABEL: vpaddq_test -; CHECK: vpaddq %zmm -; CHECK: ret define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone { +; CHECK-LABEL: vpaddq_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %x = add <8 x i64> %i, %j ret <8 x i64> %x } -; CHECK-LABEL: vpaddq_fold_test -; CHECK: vpaddq (% -; CHECK: ret define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind { +; CHECK-LABEL: vpaddq_fold_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %tmp = load <8 x i64>* %j, align 4 %x = add <8 x i64> %i, %tmp ret <8 x i64> %x } -; CHECK-LABEL: vpaddq_broadcast_test -; CHECK: vpaddq LCP{{.*}}(%rip){1to8} -; CHECK: ret define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind { +; CHECK-LABEL: vpaddq_broadcast_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq %x = add <8 x i64> %i, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1> ret <8 x i64> %x } -; CHECK-LABEL: vpaddq_broadcast2_test -; CHECK: vpaddq (%rdi){1to8} -; CHECK: ret define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind { +; CHECK-LABEL: vpaddq_broadcast2_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq %tmp = load i64* %j %j.0 = insertelement <8 x i64> undef, i64 %tmp, i32 0 %j.1 = insertelement <8 x i64> %j.0, i64 %tmp, i32 1 @@ -197,55 +225,67 @@ define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind { ret <8 x i64> %x } -; CHECK-LABEL: vpaddd_test -; CHECK: vpaddd %zmm -; CHECK: ret define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone { +; CHECK-LABEL: vpaddd_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %x = add <16 x i32> %i, %j ret <16 x i32> %x } -; CHECK-LABEL: vpaddd_fold_test -; CHECK: vpaddd (% -; CHECK: ret define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind { +; CHECK-LABEL: vpaddd_fold_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq %tmp = load <16 x i32>* %j, align 4 %x = add <16 x i32> %i, %tmp ret <16 x i32> %x } -; CHECK-LABEL: vpaddd_broadcast_test -; CHECK: vpaddd LCP{{.*}}(%rip){1to16} -; CHECK: ret define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind { +; CHECK-LABEL: vpaddd_broadcast_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: retq %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ret <16 x i32> %x } -; CHECK-LABEL: vpaddd_mask_test -; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }} -; CHECK: ret define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { +; CHECK-LABEL: vpaddd_mask_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} +; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, %j %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i ret <16 x i32> %r } -; CHECK-LABEL: 
vpaddd_maskz_test -; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z} }} -; CHECK: ret define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { +; CHECK-LABEL: vpaddd_maskz_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3 +; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, %j %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %r } -; CHECK-LABEL: vpaddd_mask_fold_test -; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }} -; CHECK: ret define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone { +; CHECK-LABEL: vpaddd_mask_fold_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} +; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer %j = load <16 x i32>* %j.ptr %x = add <16 x i32> %i, %j @@ -253,20 +293,26 @@ define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 ret <16 x i32> %r } -; CHECK-LABEL: vpaddd_mask_broadcast_test -; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }} -; CHECK: ret define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone { +; CHECK-LABEL: vpaddd_mask_broadcast_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} +; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i ret <16 x i32> %r } -; CHECK-LABEL: vpaddd_maskz_fold_test -; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z} -; CHECK: ret define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone { +; CHECK-LABEL: vpaddd_maskz_fold_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer %j = load <16 x i32>* %j.ptr %x = add <16 x i32> %i, %j @@ -274,125 +320,141 @@ define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 ret <16 x i32> %r } -; CHECK-LABEL: vpaddd_maskz_broadcast_test -; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z} -; CHECK: ret define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone { +; CHECK-LABEL: vpaddd_maskz_broadcast_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 +; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %r } -; CHECK-LABEL: vpsubq_test -; CHECK: vpsubq %zmm -; CHECK: ret define <8 x 
i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone { +; CHECK-LABEL: vpsubq_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %x = sub <8 x i64> %i, %j ret <8 x i64> %x } -; CHECK-LABEL: vpsubd_test -; CHECK: vpsubd -; CHECK: ret define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone { +; CHECK-LABEL: vpsubd_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %x = sub <16 x i32> %i, %j ret <16 x i32> %x } -; CHECK-LABEL: vpmulld_test -; CHECK: vpmulld %zmm -; CHECK: ret define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) { +; CHECK-LABEL: vpmulld_test: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %x = mul <16 x i32> %i, %j ret <16 x i32> %x } -; CHECK-LABEL: sqrtA -; CHECK: vsqrtss {{.*}} encoding: [0x62 -; CHECK: ret declare float @sqrtf(float) readnone define float @sqrtA(float %a) nounwind uwtable readnone ssp { +; CHECK-LABEL: sqrtA: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq entry: %conv1 = tail call float @sqrtf(float %a) nounwind readnone ret float %conv1 } -; CHECK-LABEL: sqrtB -; CHECK: vsqrtsd {{.*}}## encoding: [0x62 -; CHECK: ret declare double @sqrt(double) readnone define double @sqrtB(double %a) nounwind uwtable readnone ssp { +; CHECK-LABEL: sqrtB: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq entry: %call = tail call double @sqrt(double %a) nounwind readnone ret double %call } -; CHECK-LABEL: sqrtC -; CHECK: vsqrtss {{.*}}## encoding: [0x62 -; CHECK: ret declare float @llvm.sqrt.f32(float) define float @sqrtC(float %a) nounwind { +; CHECK-LABEL: sqrtC: +; CHECK: ## BB#0: +; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: retq %b = call float @llvm.sqrt.f32(float %a) ret float %b } -; CHECK-LABEL: sqrtD -; CHECK: vsqrtps {{.*}} -; CHECK: ret declare <16 x float> @llvm.sqrt.v16f32(<16 x float>) define <16 x float> @sqrtD(<16 x float> %a) nounwind { +; CHECK-LABEL: sqrtD: +; CHECK: ## BB#0: +; CHECK-NEXT: vsqrtps %zmm0, %zmm0 +; CHECK-NEXT: retq %b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a) ret <16 x float> %b } -; CHECK-LABEL: sqrtE -; CHECK: vsqrtpd {{.*}} -; CHECK: ret declare <8 x double> @llvm.sqrt.v8f64(<8 x double>) define <8 x double> @sqrtE(<8 x double> %a) nounwind { +; CHECK-LABEL: sqrtE: +; CHECK: ## BB#0: +; CHECK-NEXT: vsqrtpd %zmm0, %zmm0 +; CHECK-NEXT: retq %b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a) ret <8 x double> %b } -; CHECK-LABEL: fadd_broadcast -; CHECK: LCP{{.*}}(%rip){1to16}, %zmm0, %zmm0 -; CHECK: ret define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind { +; CHECK-LABEL: fadd_broadcast: +; CHECK: ## BB#0: +; CHECK-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = fadd <16 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000> ret <16 x float> %b } -; CHECK-LABEL: addq_broadcast -; CHECK: vpaddq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0 -; CHECK: ret define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind { +; CHECK-LABEL: addq_broadcast: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddq 
{{.*}}(%rip){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = add <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> ret <8 x i64> %b } -; CHECK-LABEL: orq_broadcast -; CHECK: vporq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0 -; CHECK: ret define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind { +; CHECK-LABEL: orq_broadcast: +; CHECK: ## BB#0: +; CHECK-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> ret <8 x i64> %b } -; CHECK-LABEL: andd512fold -; CHECK: vpandd (% -; CHECK: ret define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) { +; CHECK-LABEL: andd512fold: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpandd (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %a = load <16 x i32>* %x, align 4 %b = and <16 x i32> %y, %a ret <16 x i32> %b } -; CHECK-LABEL: andqbrst -; CHECK: vpandq (%rdi){1to8}, %zmm -; CHECK: ret define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) { +; CHECK-LABEL: andqbrst: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: retq entry: %a = load i64* %ap, align 8 %b = insertelement <8 x i64> undef, i64 %a, i32 0 @@ -400,3 +462,193 @@ entry: %d = and <8 x i64> %p1, %c ret <8 x i64>%d } + +; CHECK-LABEL: test_mask_vaddps +; CHECK: vaddps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i, + <16 x float> %j, <16 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %x = fadd <16 x float> %i, %j + %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst + ret <16 x float> %r +} + +; CHECK-LABEL: test_mask_vmulps +; CHECK: vmulps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, + <16 x float> %j, <16 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %x = fmul <16 x float> %i, %j + %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst + ret <16 x float> %r +} + +; CHECK-LABEL: test_mask_vminps +; CHECK: vminps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, + <16 x float> %j, <16 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %cmp_res = fcmp olt <16 x float> %i, %j + %min = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j + %r = select <16 x i1> %mask, <16 x float> %min, <16 x float> %dst + ret <16 x float> %r +} + +; CHECK-LABEL: test_mask_vminpd +; CHECK: vminpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, + <8 x double> %j, <8 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %cmp_res = fcmp olt <8 x double> %i, %j + %min = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j + %r = select <8 x i1> %mask, <8 x double> %min, <8 x double> %dst + ret <8 x double> %r +} + +; CHECK-LABEL: test_mask_vmaxps +; CHECK: vmaxps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, + <16 x float> %j, <16 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %cmp_res = fcmp ogt <16 x float> %i, %j + %max = 
select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j + %r = select <16 x i1> %mask, <16 x float> %max, <16 x float> %dst + ret <16 x float> %r +} + +; CHECK-LABEL: test_mask_vmaxpd +; CHECK: vmaxpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, + <8 x double> %j, <8 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %cmp_res = fcmp ogt <8 x double> %i, %j + %max = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j + %r = select <8 x i1> %mask, <8 x double> %max, <8 x double> %dst + ret <8 x double> %r +} + +; CHECK-LABEL: test_mask_vsubps +; CHECK: vsubps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, + <16 x float> %j, <16 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %x = fsub <16 x float> %i, %j + %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst + ret <16 x float> %r +} + +; CHECK-LABEL: test_mask_vdivps +; CHECK: vdivps {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, + <16 x float> %j, <16 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <16 x i32> %mask1, zeroinitializer + %x = fdiv <16 x float> %i, %j + %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst + ret <16 x float> %r +} + +; CHECK-LABEL: test_mask_vaddpd +; CHECK: vaddpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, + <8 x double> %j, <8 x i64> %mask1) + nounwind readnone { + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %x = fadd <8 x double> %i, %j + %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst + ret <8 x double> %r +} + +; CHECK-LABEL: test_maskz_vaddpd +; CHECK: vaddpd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z}}} +; CHECK: ret +define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, + <8 x i64> %mask1) nounwind readnone { + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %x = fadd <8 x double> %i, %j + %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer + ret <8 x double> %r +} + +; CHECK-LABEL: test_mask_fold_vaddpd +; CHECK: vaddpd (%rdi), {{.*%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}.*}} +; CHECK: ret +define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, + <8 x double>* %j, <8 x i64> %mask1) + nounwind { + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %tmp = load <8 x double>* %j, align 8 + %x = fadd <8 x double> %i, %tmp + %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst + ret <8 x double> %r +} + +; CHECK-LABEL: test_maskz_fold_vaddpd +; CHECK: vaddpd (%rdi), {{.*%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z}.*}} +; CHECK: ret +define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, + <8 x i64> %mask1) nounwind { + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %tmp = load <8 x double>* %j, align 8 + %x = fadd <8 x double> %i, %tmp + %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer + ret <8 x double> %r +} + +; CHECK-LABEL: test_broadcast_vaddpd +; CHECK: vaddpd (%rdi){1to8}, %zmm{{.*}} +; CHECK: ret +define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind { + %tmp = load 
double* %j + %b = insertelement <8 x double> undef, double %tmp, i32 0 + %c = shufflevector <8 x double> %b, <8 x double> undef, + <8 x i32> zeroinitializer + %x = fadd <8 x double> %c, %i + ret <8 x double> %x +} + +; CHECK-LABEL: test_mask_broadcast_vaddpd +; CHECK: vaddpd (%rdi){1to8}, %zmm{{.*{%k[1-7]}.*}} +; CHECK: ret +define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i, + double* %j, <8 x i64> %mask1) nounwind { + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %tmp = load double* %j + %b = insertelement <8 x double> undef, double %tmp, i32 0 + %c = shufflevector <8 x double> %b, <8 x double> undef, + <8 x i32> zeroinitializer + %x = fadd <8 x double> %c, %i + %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %i + ret <8 x double> %r +} + +; CHECK-LABEL: test_maskz_broadcast_vaddpd +; CHECK: vaddpd (%rdi){1to8}, %zmm{{.*{%k[1-7]} {z}.*}} +; CHECK: ret +define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j, + <8 x i64> %mask1) nounwind { + %mask = icmp ne <8 x i64> %mask1, zeroinitializer + %tmp = load double* %j + %b = insertelement <8 x double> undef, double %tmp, i32 0 + %c = shufflevector <8 x double> %b, <8 x double> undef, + <8 x i32> zeroinitializer + %x = fadd <8 x double> %c, %i + %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer + ret <8 x double> %r +} diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll index b5a2aa80ce16..9e9ad31c916f 100644 --- a/test/CodeGen/X86/avx512-build-vector.ll +++ b/test/CodeGen/X86/avx512-build-vector.ll @@ -1,30 +1,43 @@ ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -; CHECK-LABEL: test1 -; CHECK: vpxord -; CHECK: ret define <16 x i32> @test1(i32* %x) { +; CHECK-LABEL: test1: +; CHECK: ## BB#0: +; CHECK-NEXT: vmovd (%rdi), %xmm0 +; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7] +; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %y = load i32* %x, align 4 %res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4 ret <16 x i32>%res } -; CHECK-LABEL: test2 -; CHECK: vpaddd LCP{{.*}}(%rip){1to16} -; CHECK: ret define <16 x i32> @test2(<16 x i32> %x) { +; CHECK-LABEL: test2: +; CHECK: ## BB#0: +; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: retq %res = add <16 x i32><i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %x ret <16 x i32>%res } -; CHECK-LABEL: test3 -; CHECK: vinsertf128 -; CHECK: vinsertf64x4 -; CHECK: ret define <16 x float> @test3(<4 x float> %a) { +; CHECK-LABEL: test3: +; CHECK: ## BB#0: +; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vmovss %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vmovss %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,0],xmm0[0,1] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %b = extractelement <4 x float> %a, i32 2 %c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, 
float 0.000000e+00, float 0.000000e+00>, float %b, i32 5 %b1 = extractelement <4 x float> %a, i32 0 %c1 = insertelement <16 x float> %c, float %b1, i32 6 ret <16 x float>%c1 -}
\ No newline at end of file +} diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll index 47e50a93796a..6e0d18558c51 100644 --- a/test/CodeGen/X86/avx512-cmp.ll +++ b/test/CodeGen/X86/avx512-cmp.ll @@ -28,10 +28,9 @@ l2: ret float %c1 } +; FIXME: Can use vcmpeqss and extract from the mask here in AVX512. ; CHECK-LABEL: test3 -; CHECK: vcmpeqss -; CHECK: kmov -; CHECK: ret +; CHECK: vucomiss {{.*}}encoding: [0x62 define i32 @test3(float %a, float %b) { %cmp10.i = fcmp oeq float %a, %b @@ -86,3 +85,17 @@ define i32 @test8(i32 %a1, i32 %a2, i32 %a3) { %res = select i1 %tmp5, i32 1, i32 %a3 ret i32 %res } + +; CHECK-LABEL: test9 +; CHECK: testb +; CHECK-NOT: kmov +; CHECK: ret +define i32 @test9(i64 %a) { + %b = and i64 %a, 1 + %cmp10.i = icmp eq i64 %b, 0 + br i1 %cmp10.i, label %A, label %B +A: + ret i32 6 +B: + ret i32 7 +} diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index f5cda96b99fa..2b672a72d539 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -255,3 +255,56 @@ define double @uitofp03(i32 %a) nounwind { %b = uitofp i32 %a to double ret double %b } + +; CHECK-LABEL: @sitofp_16i1_float +; CHECK: vpbroadcastd +; CHECK: vcvtdq2ps +define <16 x float> @sitofp_16i1_float(<16 x i32> %a) { + %mask = icmp slt <16 x i32> %a, zeroinitializer + %1 = sitofp <16 x i1> %mask to <16 x float> + ret <16 x float> %1 +} + +; CHECK-LABEL: @sitofp_16i8_float +; CHECK: vpmovsxbd +; CHECK: vcvtdq2ps +define <16 x float> @sitofp_16i8_float(<16 x i8> %a) { + %1 = sitofp <16 x i8> %a to <16 x float> + ret <16 x float> %1 +} + +; CHECK-LABEL: @sitofp_16i16_float +; CHECK: vpmovsxwd +; CHECK: vcvtdq2ps +define <16 x float> @sitofp_16i16_float(<16 x i16> %a) { + %1 = sitofp <16 x i16> %a to <16 x float> + ret <16 x float> %1 +} + +; CHECK-LABEL: @sitofp_8i16_double +; CHECK: vpmovsxwd +; CHECK: vcvtdq2pd +define <8 x double> @sitofp_8i16_double(<8 x i16> %a) { + %1 = sitofp <8 x i16> %a to <8 x double> + ret <8 x double> %1 +} + +; CHECK-LABEL: sitofp_8i8_double +; CHECK: vpmovzxwd +; CHECK: vpslld +; CHECK: vpsrad +; CHECK: vcvtdq2pd +define <8 x double> @sitofp_8i8_double(<8 x i8> %a) { + %1 = sitofp <8 x i8> %a to <8 x double> + ret <8 x double> %1 +} + + +; CHECK-LABEL: @sitofp_8i1_double +; CHECK: vpbroadcastq +; CHECK: vcvtdq2pd +define <8 x double> @sitofp_8i1_double(<8 x double> %a) { + %cmpres = fcmp ogt <8 x double> %a, zeroinitializer + %1 = sitofp <8 x i1> %cmpres to <8 x double> + ret <8 x double> %1 +} diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll index ce3d7590f396..116531d5af9e 100644 --- a/test/CodeGen/X86/avx512-fma-intrinsics.ll +++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll @@ -1,97 +1,184 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s define <16 x float> @test_x86_vfmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfmadd_ps_z ; CHECK: vfmadd213ps %zmm - %res = call <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <16 x float> 
@test_mask_vfmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfmadd_ps + ; CHECK: vfmadd213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind ret <16 x float> %res } -declare <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone define <8 x double> @test_x86_vfmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmadd_pd_z ; CHECK: vfmadd213pd %zmm - %res = call <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind + ret <8 x double> %res +} + +define <8 x double> @test_mask_fmadd_pd(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) { +; CHECK-LABEL: test_mask_fmadd_pd: +; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2] + %res = call <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask, i32 4) ret <8 x double> %res } -declare <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone + +declare <8 x double> @llvm.x86.fma.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) define <16 x float> @test_x86_vfmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfmsubps_z ; CHECK: vfmsub213ps %zmm - %res = call <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <16 x float> @test_mask_vfmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfmsub_ps + ; CHECK: vfmsub213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind ret <16 x float> %res } -declare <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmsubpd_z ; CHECK: vfmsub213pd %zmm - %res = call <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + %res = call <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <8 x double> @test_mask_vfmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsub_pd + ; CHECK: vfmsub213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind ret <8 x double> %res } -declare <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone define <16 x float> @test_x86_vfnmadd_ps_z(<16 x 
float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfnmadd_ps_z ; CHECK: vfnmadd213ps %zmm - %res = call <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + %res = call <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfnmadd_ps + ; CHECK: vfnmadd213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind ret <16 x float> %res } -declare <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfnmadd_pd_z ; CHECK: vfnmadd213pd %zmm - %res = call <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + %res = call <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmadd_pd + ; CHECK: vfnmadd213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind ret <8 x double> %res } -declare <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfnmsubps_z ; CHECK: vfnmsub213ps %zmm - %res = call <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + %res = call <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub_ps + ; CHECK: vfnmsub213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind ret <16 x float> %res } -declare <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfnmsubpd_z ; CHECK: vfnmsub213pd %zmm - %res = call <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double>, <8 x 
double>, <8 x double>, i8, i32) nounwind readnone + +define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfnmsub_pd + ; CHECK: vfnmsub213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind ret <8 x double> %res } -declare <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfmaddsubps_z ; CHECK: vfmaddsub213ps %zmm - %res = call <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind + ret <16 x float> %res +} + +define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) { +; CHECK-LABEL: test_mask_fmaddsub_ps: +; CHECK: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa6,0xc2] + %res = call <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4) ret <16 x float> %res } -declare <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone + +declare <16 x float> @llvm.x86.fma.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmaddsubpd_z ; CHECK: vfmaddsub213pd %zmm - %res = call <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + %res = call <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmaddsub_pd + ; CHECK: vfmaddsub213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind ret <8 x double> %res } -declare <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { ; CHECK-LABEL: test_x86_vfmsubaddps_z ; CHECK: vfmsubadd213ps %zmm - %res = call <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind + %res = call <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <16 x float> @test_mask_vfmsubadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd_ps + ; CHECK: vfmsubadd213ps %zmm + %res = call <16 x float> @llvm.x86.fma.mask.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) 
nounwind ret <16 x float> %res } -declare <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { ; CHECK-LABEL: test_x86_vfmsubaddpd_z ; CHECK: vfmsubadd213pd %zmm - %res = call <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind + %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <8 x double> @test_mask_vfmsubadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) { + ; CHECK-LABEL: test_mask_vfmsubadd_pd + ; CHECK: vfmsubadd213pd %zmm + %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind ret <8 x double> %res } -declare <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone + diff --git a/test/CodeGen/X86/avx512-i1test.ll b/test/CodeGen/X86/avx512-i1test.ll new file mode 100755 index 000000000000..4814314a6442 --- /dev/null +++ b/test/CodeGen/X86/avx512-i1test.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s + +; ModuleID = 'bugpoint-reduced-simplified.bc' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK-LABEL: func +; CHECK: kortestw +; CHECK: kortestw +define void @func() { +bb1: + br i1 undef, label %L_10, label %L_10 + +L_10: ; preds = %bb1, %bb1 + br i1 undef, label %L_30, label %bb56 + +bb56: ; preds = %L_10 + br label %bb33 + +bb33: ; preds = %bb51, %bb56 + %r111 = load i64* undef, align 8 + br i1 undef, label %bb51, label %bb35 + +bb35: ; preds = %bb33 + br i1 undef, label %L_19, label %bb37 + +bb37: ; preds = %bb35 + %r128 = and i64 %r111, 576460752303423488 + %phitmp = icmp eq i64 %r128, 0 + br label %L_19 + +L_19: ; preds = %bb37, %bb35 + %"$V_S25.0" = phi i1 [ %phitmp, %bb37 ], [ true, %bb35 ] + br i1 undef, label %bb51, label %bb42 + +bb42: ; preds = %L_19 + %r136 = select i1 %"$V_S25.0", i32* undef, i32* undef + br label %bb51 + +bb51: ; preds = %bb42, %L_19, %bb33 + br i1 false, label %L_30, label %bb33 + +L_30: ; preds = %bb51, %L_10 + ret void +} diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index b360c716b004..eba895ebf565 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL --check-prefix=CHECK %s +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=CHECK %s ;CHECK-LABEL: test1: ;CHECK: vinsertps @@ -12,9 +13,11 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind { } ;CHECK-LABEL: test2: -;CHECK: vinsertf32x4 -;CHECK: vextractf32x4 -;CHECK: vinsertf32x4 +;KNL: vinsertf32x4 $0 +;SKX: vinsertf64x2 $0 +;CHECK: vextractf32x4 $3 +;KNL: vinsertf32x4 $3 +;SKX: vinsertf64x2 $3 ;CHECK: ret define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind { %rrr = load double* 
%br @@ -24,8 +27,8 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind { } ;CHECK-LABEL: test3: -;CHECK: vextractf32x4 -;CHECK: vinsertf32x4 +;CHECK: vextractf32x4 $1 +;CHECK: vinsertf32x4 $0 ;CHECK: ret define <16 x float> @test3(<16 x float> %x) nounwind { %eee = extractelement <16 x float> %x, i32 4 @@ -34,8 +37,9 @@ define <16 x float> @test3(<16 x float> %x) nounwind { } ;CHECK-LABEL: test4: -;CHECK: vextracti32x4 -;CHECK: vinserti32x4 +;CHECK: vextracti32x4 $2 +;KNL: vinserti32x4 $0 +;SKX: vinserti64x2 $0 ;CHECK: ret define <8 x i64> @test4(<8 x i64> %x) nounwind { %eee = extractelement <8 x i64> %x, i32 4 @@ -186,12 +190,13 @@ define i16 @test16(i1 *%addr, i16 %a) { ;CHECK-LABEL: test17 ;CHECK: kshiftlw ;CHECK: kshiftrw -;CHECK: korw +;KNL: korw +;SKX: korb ;CHECK: ret define i8 @test17(i1 *%addr, i8 %a) { %x = load i1 * %addr, align 128 %a1 = bitcast i8 %a to <8 x i1> - %x1 = insertelement <8 x i1> %a1, i1 %x, i32 10 + %x1 = insertelement <8 x i1> %a1, i1 %x, i32 4 %x2 = bitcast <8 x i1>%x1 to i8 ret i8 %x2 } diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 18cfcfe78b05..7cd01683fa98 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -60,20 +60,6 @@ define <8 x double> @test_rcp_pd_512(<8 x double> %a0) { } declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone -define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) { - ; CHECK: vrcp28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0] - %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1] - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone - -define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) { - ; CHECK: vrcp28pd {sae}, {{.*}}encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0] - %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) ; <<8 x double>> [#uses=1] - ret <8 x double> %res -} -declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone - declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32) define <8 x double> @test7(<8 x double> %a) { @@ -97,13 +83,6 @@ define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) { } declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone -define <16 x float> @test_rsqrt28_ps_512(<16 x float> %a0) { - ; CHECK: vrsqrt28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0] - %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1] - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone - define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) { ; CHECK: vrsqrt14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4f,0xc0] %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1] @@ -111,13 +90,6 @@ define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) { } declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone -define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) { - ; CHECK: 
vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0] - %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1] - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone - define <4 x float> @test_rcp14_ss(<4 x float> %a0) { ; CHECK: vrcp14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4d,0xc0] %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1] @@ -125,26 +97,19 @@ define <4 x float> @test_rcp14_ss(<4 x float> %a0) { } declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone -define <4 x float> @test_rcp28_ss(<4 x float> %a0) { - ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0] - %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1] - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone - define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) { ; CHECK: vsqrtpd - %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1] + %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) ; <<8 x double>> [#uses=1] ret <8 x double> %res } -declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>) nounwind readnone +declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) { ; CHECK: vsqrtps - %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1] + %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) ; <<16 x float>> [#uses=1] ret <16 x float> %res } -declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>) nounwind readnone +declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) { ; CHECK: vsqrtss {{.*}}encoding: [0x62 @@ -375,7 +340,7 @@ define <8 x i64> @test_ctlz_q(<8 x i64> %a) { declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1) nounwind readonly define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) { - ; CHECK: vblendmps + ; CHECK: vblendmps %zmm1, %zmm0 %res = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float> %a1, <16 x float> %a2, i16 %a0) ; <<16 x float>> [#uses=1] ret <16 x float> %res } @@ -383,7 +348,7 @@ define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x declare <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float>, <16 x float>, i16) nounwind readonly define <8 x double> @test_x86_mask_blend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) { - ; CHECK: vblendmpd + ; CHECK: vblendmpd %zmm1, %zmm0 %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a1, <8 x double> %a2, i8 %a0) ; <<8 x double>> [#uses=1] ret <8 x double> %res } @@ -611,3 +576,800 @@ define <8 x i64> @test_vmovntdqa(i8 *%x) { } declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*) + +define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) { +; 
CHECK-LABEL: test_valign_q: +; CHECK: valignq $2, %zmm1, %zmm0, %zmm0 + %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i8 2, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) { +; CHECK-LABEL: test_mask_valign_q: +; CHECK: valignq $2, %zmm1, %zmm0, %zmm2 {%k1} + %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i8 2, <8 x i64> %src, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i8, <8 x i64>, i8) + +define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { +; CHECK-LABEL: test_maskz_valign_d: +; CHECK: valignd $5, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x03,0xc1,0x05] + %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i8 5, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i8, <16 x i32>, i16) + +define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) { + ; CHECK-LABEL: test_mask_store_ss + ; CHECK: vmovss %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x11,0x07] + call void @llvm.x86.avx512.mask.store.ss(i8* %ptr, <4 x float> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 ) + +define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) { +; CHECK-LABEL: test_pcmpeq_d +; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_d +; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16) + +define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test_pcmpeq_q +; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_q +; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8) + +define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) { +; CHECK-LABEL: test_pcmpgt_d +; CHECK: vpcmpgtd %zmm1, %zmm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_d +; CHECK: vpcmpgtd %zmm1, %zmm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16) + +define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test_pcmpgt_q +; CHECK: vpcmpgtq %zmm1, %zmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { +; CHECK-LABEL: 
test_mask_pcmpgt_q
+; CHECK: vpcmpgtq %zmm1, %zmm0, %k0 {%k1} ##
+ %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+ ret i8 %res
+}
+
+declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
+
+define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: test_cmp_d_512
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 ##
+ %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltd %zmm1, %zmm0, %k0 ##
+ %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpled %zmm1, %zmm0, %k0 ##
+ %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 ##
+ %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 ##
+ %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 ##
+ %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnled %zmm1, %zmm0, %k0 ##
+ %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordd %zmm1, %zmm0, %k0 ##
+ %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK-LABEL: test_mask_cmp_d_512
+; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ##
+ %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltd %zmm1, %zmm0, %k0 {%k1} ##
+ %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpled %zmm1, %zmm0, %k0 {%k1} ##
+ %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 {%k1} ##
+ %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ##
+ %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ##
+ %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnled %zmm1, %zmm0, %k0 {%k1} ##
+ %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordd %zmm1, %zmm0, %k0 {%k1} ##
+ %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
+
+define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: test_ucmp_d_512
+; CHECK: vpcmpequd %zmm1, %zmm0, %k0 ##
+ %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltud %zmm1, %zmm0, %k0 ##
+ %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleud %zmm1, %zmm0, %k0 ##
+ %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 ##
+ %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 ##
+ %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 ##
+ %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 ##
+ %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordud %zmm1, %zmm0, %k0 ##
+ %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+; CHECK-LABEL: test_mask_ucmp_d_512
+; CHECK: vpcmpequd %zmm1, %zmm0, %k0 {%k1} ##
+ %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ##
+ %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ##
+ %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 {%k1} ##
+ %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 {%k1} ##
+ %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ##
+ %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ##
+ %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordud %zmm1, %zmm0, %k0 {%k1} ##
+ %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
+
+define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: test_cmp_q_512
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %zmm1, %zmm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %zmm1, %zmm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %zmm1, %zmm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_cmp_q_512
+; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltq %zmm1, %zmm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleq %zmm1, %zmm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordq %zmm1, %zmm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: test_ucmp_q_512
+; CHECK: vpcmpequq %zmm1, %zmm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %zmm1, %zmm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_ucmp_q_512
+; CHECK: vpcmpequq %zmm1, %zmm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduq %zmm1, %zmm0, %k0 {%k1} ##
+ %res7 = call 
i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone + +define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) { +; CHECK-LABEL: test_mask_vextractf32x4: +; CHECK: vextractf32x4 $2, %zmm1, %xmm0 {%k1} + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i8 2, <4 x float> %b, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i8, <4 x float>, i8) + +define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) { +; CHECK-LABEL: test_mask_vextracti64x4: +; CHECK: vextracti64x4 $2, %zmm1, %ymm0 {%k1} + %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i8 2, <4 x i64> %b, i8 %mask) + ret <4 x i64> %res +} + +declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i8, <4 x i64>, i8) + +define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) { +; CHECK-LABEL: test_maskz_vextracti32x4: +; CHECK: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z} + %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i8 2, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i8, <4 x i32>, i8) + +define <4 x double> @test_vextractf64x4(<8 x double> %a) { +; CHECK-LABEL: test_vextractf64x4: +; CHECK: vextractf64x4 $2, %zmm0, %ymm0 ## + %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i8 2, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i8, <4 x double>, i8) + +define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) { + ; CHECK-LABEL: test_x86_avx512_pslli_d + ; CHECK: vpslld + %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_pslli_d + ; CHECK: vpslld $7, %zmm0, %zmm1 {%k1} + %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_pslli_d + ; CHECK: vpslld $7, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) { + ; CHECK-LABEL: test_x86_avx512_pslli_q + ; CHECK: vpsllq + %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_pslli_q + ; CHECK: vpsllq $7, %zmm0, %zmm1 {%k1} + %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_pslli_q + 
; CHECK: vpsllq $7, %zmm0, %zmm0 {%k1} {z} + %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) { + ; CHECK-LABEL: test_x86_avx512_psrli_d + ; CHECK: vpsrld + %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_psrli_d + ; CHECK: vpsrld $7, %zmm0, %zmm1 {%k1} + %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_psrli_d + ; CHECK: vpsrld $7, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) { + ; CHECK-LABEL: test_x86_avx512_psrli_q + ; CHECK: vpsrlq + %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_psrli_q + ; CHECK: vpsrlq $7, %zmm0, %zmm1 {%k1} + %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_psrli_q + ; CHECK: vpsrlq $7, %zmm0, %zmm0 {%k1} {z} + %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) { + ; CHECK-LABEL: test_x86_avx512_psrai_d + ; CHECK: vpsrad + %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_psrai_d + ; CHECK: vpsrad $7, %zmm0, %zmm1 {%k1} + %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_psrai_d + ; CHECK: vpsrad $7, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) { + ; CHECK-LABEL: test_x86_avx512_psrai_q + ; CHECK: vpsraq + %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, 
<8 x i64> %a1, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_psrai_q + ; CHECK: vpsraq $7, %zmm0, %zmm1 {%k1} + %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_psrai_q + ; CHECK: vpsraq $7, %zmm0, %zmm0 {%k1} {z} + %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) { + ; CHECK-LABEL: test_x86_avx512_psll_d + ; CHECK: vpslld + %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_psll_d + ; CHECK: vpslld %xmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_psll_d + ; CHECK: vpslld %xmm1, %zmm0, %zmm0 {%k1} {z} + %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) { + ; CHECK-LABEL: test_x86_avx512_psll_q + ; CHECK: vpsllq + %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_psll_q + ; CHECK: vpsllq %xmm1, %zmm0, %zmm2 {%k1} + %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_psll_q + ; CHECK: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z} + %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) { + ; CHECK-LABEL: test_x86_avx512_psrl_d + ; CHECK: vpsrld + %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_mask_psrl_d + ; CHECK: vpsrld %xmm1, %zmm0, %zmm2 {%k1} + %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) { + ; CHECK-LABEL: test_x86_avx512_maskz_psrl_d + ; CHECK: vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z} 
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psrl_q
+  ; CHECK: vpsrlq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrl_q
+  ; CHECK: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrl_q
+  ; CHECK: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psra_d
+  ; CHECK: vpsrad
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psra_d
+  ; CHECK: vpsrad %xmm1, %zmm0, %zmm2 {%k1}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psra_d
+  ; CHECK: vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psra_q
+  ; CHECK: vpsraq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psra_q
+  ; CHECK: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psra_q
+  ; CHECK: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psllv_d
+  ; CHECK: vpsllvd
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psllv_d
+  ; CHECK: vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psllv_d
+  ; CHECK: vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psllv_q
+  ; CHECK: vpsllvq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psllv_q
+  ; CHECK: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psllv_q
+  ; CHECK: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
+
+
+define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psrav_d
+  ; CHECK: vpsravd
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrav_d
+  ; CHECK: vpsravd %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrav_d
+  ; CHECK: vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psrav_q
+  ; CHECK: vpsravq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrav_q
+  ; CHECK: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrav_q
+  ; CHECK: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
+
+define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psrlv_d
+  ; CHECK: vpsrlvd
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrlv_d
+  ; CHECK: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrlv_d
+  ; CHECK: vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) {
+  ; CHECK-LABEL: test_x86_avx512_psrlv_q
+  ; CHECK: vpsrlvq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_mask_psrlv_q
+  ; CHECK: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+  ; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q
+  ; CHECK: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
+
+define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, <8 x i64>* %ptr) {
+  ; CHECK-LABEL: test_x86_avx512_psrlv_q_memop
+  ; CHECK: vpsrlvq (%
+  %b = load <8 x i64>* %ptr
+  %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
diff --git a/test/CodeGen/X86/avx512-logic.ll b/test/CodeGen/X86/avx512-logic.ll
new file mode 100644
index 000000000000..bee4f52b3216
--- /dev/null
+++ b/test/CodeGen/X86/avx512-logic.ll
@@ -0,0 +1,101 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: vpandd
+; CHECK: vpandd %zmm
+; CHECK: ret
+define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
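+  ; The add keeps the value in the integer domain, so instruction
+  ; selection should pick the EVEX integer form (vpandd) checked above
+  ; rather than a floating-point logic op such as vandps.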
+  %a2 = add <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
+                            i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %x = and <16 x i32> %a2, %b
+  ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpord
+; CHECK: vpord %zmm
+; CHECK: ret
+define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
+                            i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %x = or <16 x i32> %a2, %b
+  ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpxord
+; CHECK: vpxord %zmm
+; CHECK: ret
+define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1,
+                            i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %x = xor <16 x i32> %a2, %b
+  ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpandq
+; CHECK: vpandq %zmm
+; CHECK: ret
+define <8 x i64> @vpandq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %x = and <8 x i64> %a2, %b
+  ret <8 x i64> %x
+}
+
+; CHECK-LABEL: vporq
+; CHECK: vporq %zmm
+; CHECK: ret
+define <8 x i64> @vporq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %x = or <8 x i64> %a2, %b
+  ret <8 x i64> %x
+}
+
+; CHECK-LABEL: vpxorq
+; CHECK: vpxorq %zmm
+; CHECK: ret
+define <8 x i64> @vpxorq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
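+  ; Same trick for the 64-bit element type: the i64 add should make
+  ; this 'xor' lower to vpxorq rather than an FP form like vxorpd.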
+  %a2 = add <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %x = xor <8 x i64> %a2, %b
+  ret <8 x i64> %x
+}
+
+
+; CHECK-LABEL: orq_broadcast
+; CHECK: vporq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK: ret
+define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
+  %b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+  ret <8 x i64> %b
+}
+
+; CHECK-LABEL: andd512fold
+; CHECK: vpandd (%
+; CHECK: ret
+define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
+entry:
+  %a = load <16 x i32>* %x, align 4
+  %b = and <16 x i32> %y, %a
+  ret <16 x i32> %b
+}
+
+; CHECK-LABEL: andqbrst
+; CHECK: vpandq (%rdi){1to8}, %zmm
+; CHECK: ret
+define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
+entry:
+  %a = load i64* %ap, align 8
+  %b = insertelement <8 x i64> undef, i64 %a, i32 0
+  %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
+  %d = and <8 x i64> %p1, %c
+  ret <8 x i64>%d
+}
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index dd33ffdb640d..35d334813fa8 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -1,12 +1,14 @@
-; RUN: llc < %s -march=x86-64 -mcpu=knl | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
 define i16 @mask16(i16 %x) {
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %ret = bitcast <16 x i1> %m1 to i16
   ret i16 %ret
-; CHECK: mask16
-; CHECK: knotw
+; CHECK-LABEL: mask16
+; CHECK: kmovw
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw
 ; CHECK: ret
 }
 
@@ -15,8 +17,38 @@ define i8 @mask8(i8 %x) {
   %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %ret = bitcast <8 x i1> %m1 to i8
   ret i8 %ret
-; CHECK: mask8
-; CHECK: knotw
+; CHECK-LABEL: mask8
+; CHECK: kmovw
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw
+; CHECK: ret
+}
+
+define void @mask16_mem(i16* %ptr) {
+  %x = load i16* %ptr, align 4
+  %m0 = bitcast i16 %x to <16 x i1>
+  %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <16 x i1> %m1 to i16
+  store i16 %ret, i16* %ptr, align 4
+  ret void
+; CHECK-LABEL: mask16_mem
+; CHECK: kmovw ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
+; CHECK: ret
+}
+
+define void @mask8_mem(i8* %ptr) {
+  %x = load i8* %ptr, align 4
+  %m0 = bitcast i8 %x to <8 x i1>
+  %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <8 x i1> %m1 to i8
+  store i8 %ret, i8* %ptr, align 4
+  ret void
+; CHECK-LABEL: mask8_mem
+; CHECK: kmovw ([[ARG1]]), %k{{[0-7]}}
+; CHECK-NEXT: knotw
+; CHECK-NEXT: kmovw %k{{[0-7]}}, ([[ARG1]])
 ; CHECK: ret
 }
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
index 009802f1742d..93875e839e22 100644
--- a/test/CodeGen/X86/avx512-mov.ll
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -153,31 +153,295 @@ define void @test18(i8 * %addr, <8 x i64> %data) {
   ret void
 }
 
-; CHECK-LABEL: store_i1_1
-; CHECK: movb
-; CHECK: movb
+; CHECK-LABEL: test19
+; CHECK: vmovdqu32
+; CHECK: ret
+define void @test19(i8 * %addr, <16 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  store <16 x i32>%data, <16 x i32>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test20
+; CHECK: vmovdqa32
+; CHECK: ret
+define void @test20(i8 * %addr, <16 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  store <16 x i32>%data, <16 x i32>* %vaddr, align 64
+  ret void
+}
+
+; CHECK-LABEL: test21
+; CHECK: vmovdqa64
 ; CHECK: ret
-define void @store_i1_1() {
-  store i1 true, i1 addrspace(3)* undef, align 128
-  store i1 false, i1 addrspace(2)* undef, align 128
+define <8 x i64> @test21(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %res = load <8 x i64>* %vaddr, align 64
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test22
+; CHECK: vmovdqu64
+; CHECK: ret
+define void @test22(i8 * %addr, <8 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  store <8 x i64>%data, <8 x i64>* %vaddr, align 1
   ret void
 }
 
-; CHECK-LABEL: store_i1_2
-; CHECK: movb
+; CHECK-LABEL: test23
+; CHECK: vmovdqu64
 ; CHECK: ret
-define void @store_i1_2(i64 %a, i64 %b) {
-  %res = icmp eq i64 %a, %b
-  store i1 %res, i1 addrspace(3)* undef, align 128
+define <8 x i64> @test23(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %res = load <8 x i64>* %vaddr, align 1
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test24
+; CHECK: vmovapd
+; CHECK: ret
+define void @test24(i8 * %addr, <8 x double> %data) {
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  store <8 x double>%data, <8 x double>* %vaddr, align 64
   ret void
 }
 
-; CHECK-LABEL: store_i1_3
-; CHECK: kmovw
+; CHECK-LABEL: test25
+; CHECK: vmovapd
+; CHECK: ret
+define <8 x double> @test25(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %res = load <8 x double>* %vaddr, align 64
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test26
+; CHECK: vmovaps
 ; CHECK: ret
-define void @store_i1_3(i16 %a) {
-  %a_vec = bitcast i16 %a to <16 x i1>
-  %res = extractelement <16 x i1> %a_vec, i32 4
-  store i1 %res, i1 addrspace(3)* undef, align 128
+define void @test26(i8 * %addr, <16 x float> %data) {
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  store <16 x float>%data, <16 x float>* %vaddr, align 64
   ret void
 }
+
+; CHECK-LABEL: test27
+; CHECK: vmovaps
+; CHECK: ret
+define <16 x float> @test27(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %res = load <16 x float>* %vaddr, align 64
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test28
+; CHECK: vmovupd
+; CHECK: ret
+define void @test28(i8 * %addr, <8 x double> %data) {
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  store <8 x double>%data, <8 x double>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test29
+; CHECK: vmovupd
+; CHECK: ret
+define <8 x double> @test29(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %res = load <8 x double>* %vaddr, align 1
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test30
+; CHECK: vmovups
+; CHECK: ret
+define void @test30(i8 * %addr, <16 x float> %data) {
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  store <16 x float>%data, <16 x float>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test31
+; CHECK: vmovups
+; CHECK: ret
+define <16 x float> @test31(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %res = load <16 x float>* %vaddr, align 1
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test32
+; CHECK: vmovdqa32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test33
+; CHECK: vmovdqu32{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test34
+; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test35
+; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer
+  ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test36
+; CHECK: vmovdqa64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test37
+; CHECK: vmovdqu64{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test38
+; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test39
+; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer
+  ret <8 x i64>%res
+}
+
+; CHECK-LABEL: test40
+; CHECK: vmovaps{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test41
+; CHECK: vmovups{{.*{%k[1-7]} }}
+; CHECK: ret
+define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test42
+; CHECK: vmovaps{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) {
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test43
+; CHECK: vmovups{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) {
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer
+  ret <16 x float>%res
+}
+
+; CHECK-LABEL: test44
+; CHECK: vmovapd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test45
+; CHECK: vmovupd{{.*{%k[1-7]} }}
+; CHECK: ret
+define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test46
+; CHECK: vmovapd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) {
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer
+  ret <8 x double>%res
+}
+
+; CHECK-LABEL: test47
+; CHECK: vmovupd{{.*{%k[1-7]} {z} }}
+; CHECK: ret
+define <8 x double> @test47(i8 * %addr, <8 x double> %mask1) {
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer
+  ret <8 x double>%res
+}
diff --git a/test/CodeGen/X86/avx512-nontemporal.ll b/test/CodeGen/X86/avx512-nontemporal.ll
index ef50cdb82831..bf57d021acab 100644
--- a/test/CodeGen/X86/avx512-nontemporal.ll
+++ b/test/CodeGen/X86/avx512-nontemporal.ll
@@ -16,4 +16,4 @@ define void @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x
   ret void
 }
 
-!0 = metadata !{i32 1}
+!0 = !{i32 1}
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 83f46984781f..0dbf286d3c5d 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -39,3 +39,56 @@ define double @select03(double %a, double %b, double %c, double %eps) {
   %cond = select i1 %cmp, double %c, double %b
   ret double %cond
 }
+
+; CHECK-LABEL: @select04
+; CHECK: vmovaps %zmm3, %zmm1
+; CHECK-NEXT: ret
+; PR20677
+define <16 x double> @select04(<16 x double> %a, <16 x double> %b) {
+  %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
+  ret <16 x double> %sel
+}
+
+; CHECK-LABEL: select05
+; CHECK: kmovw %esi, %k0
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+define i8 @select05(i8 %a.0, i8 %m) {
+  %mask = bitcast i8 %m to <8 x i1>
+  %a = bitcast i8 %a.0 to <8 x i1>
+  %r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a
+  %res = bitcast <8 x i1> %r to i8
+  ret i8 %res;
+}
+
+; CHECK-LABEL: select06
+; CHECK: kmovw %esi, %k0
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: kandw %k1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+define i8 @select06(i8 %a.0, i8 %m) {
+  %mask = bitcast i8 %m to <8 x i1>
+  %a = bitcast i8 %a.0 to <8 x i1>
+  %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer
+  %res = bitcast <8 x i1> %r to i8
+  ret i8 %res;
+}
+
+; CHECK-LABEL: select07
+; CHECK-DAG: kmovw %edx, %k0
+; CHECK-DAG: kmovw %edi, %k1
+; CHECK-DAG: kmovw %esi, %k2
+; CHECK: kandw %k0, %k1, %k1
+; CHECK-NEXT: knotw %k0, %k0
+; CHECK-NEXT: kandw %k0, %k2, %k0
+; CHECK-NEXT: korw %k0, %k1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
+  %mask = bitcast i8 %m to <8 x i1>
+  %a = bitcast i8 %a.0 to <8 x i1>
+  %b = bitcast i8 %b.0 to <8 x i1>
+  %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> %b
+  %res = bitcast <8 x i1> %r to i8
+  ret i8 %res;
+}
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
deleted file mode 100644
index b99e89a9a546..000000000000
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ /dev/null
@@ -1,314 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
-; CHECK: LCP
-; CHECK: .long 2
-; CHECK: .long 5
-; CHECK: .long 0
-; CHECK: .long 0
-; CHECK: .long 7
-; CHECK: .long 0
-; CHECK: .long 10
-; CHECK: .long 1
-; CHECK: .long 0
-; CHECK: .long 5
-; CHECK: .long 0
-; CHECK: .long 4
-; CHECK: .long 7
-; CHECK: .long 0
-; CHECK: .long 10
-; CHECK: .long 1
-; CHECK-LABEL: test1:
-; CHECK: vpermps
-; CHECK: ret
-define <16 x float> @test1(<16 x float> %a) nounwind {
-  %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
-  ret <16 x float> %c
-}
-
-; CHECK-LABEL: test2:
-; CHECK: vpermd
-; CHECK: ret
-define <16 x i32> @test2(<16 x i32> %a) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
-  ret <16 x i32> %c
-}
-
-; CHECK-LABEL: test3:
-; CHECK: vpermq
-; CHECK: ret
-define <8 x i64> @test3(<8 x i64> %a) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 5, i32 1, i32 undef, i32 7, i32 undef, i32 3, i32 1>
-  ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test4:
-; CHECK: vpermpd
-; CHECK: ret
-define <8 x double> @test4(<8 x double> %a) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <8 x double> %c
-}
-
-; CHECK-LABEL: test5:
-; CHECK: vpermt2pd
-; CHECK: ret
-define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  ret <8 x double> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test5m:
-; CHECK: vpermt2pd {{.* {%k[1-7]} {z}}}
-define <8 x double> @test5m(<8 x double> %a, <8 x double> %b, i8 %mask) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x double> %c, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-; CHECK-LABEL: test6:
-; CHECK: vpermq $30
-; CHECK: ret
-define <8 x i64> @test6(<8 x i64> %a) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
-  ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test7:
-; CHECK: vpermt2q
-; CHECK: ret
-define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  ret <8 x i64> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test7m:
-; CHECK: vpermt2q {{.* {%k[1-7]} {z}}}
-define <8 x i64> @test7m(<8 x i64> %a, <8 x i64> %b, i8 %mask) nounwind {
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-; The mem variant of vpermt2 with a writemask
-; CHECK-LABEL: test7mm:
-; CHECK: vpermt2q {{\(.*\).* {%k[1-7]} {z}}}
-define <8 x i64> @test7mm(<8 x i64> %a, <8 x i64> *%pb, i8 %mask) nounwind {
-  %b = load <8 x i64>* %pb
-  %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
-  %m = bitcast i8 %mask to <8 x i1>
-  %res = select <8 x i1> %m, <8 x i64> %c, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-; CHECK-LABEL: test8:
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  ret <16 x i32> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test8m:
-; CHECK: vpermt2d {{.* {%k[1-7]} {z}}}
-define <16 x i32> @test8m(<16 x i32> %a, <16 x i32> %b, i16 %mask) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  %m = bitcast i16 %mask to <16 x i1>
-  %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-; The mem variant of vpermt2 with a writemask
-; CHECK-LABEL: test8mm:
-; CHECK: vpermt2d {{\(.*\).* {%k[1-7]} {z}}}
-define <16 x i32> @test8mm(<16 x i32> %a, <16 x i32> *%pb, i16 %mask) nounwind {
-  %b = load <16 x i32> * %pb
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  %m = bitcast i16 %mask to <16 x i1>
-  %res = select <16 x i1> %m, <16 x i32> %c, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-; CHECK-LABEL: test9:
-; CHECK: vpermt2ps
-; CHECK: ret
-define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
-  %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  ret <16 x float> %c
-}
-
-; The reg variant of vpermt2 with a writemask
-; CHECK-LABEL: test9m:
-; CHECK: vpermt2ps {{.*}} {%k{{.}}} {z}
-define <16 x float> @test9m(<16 x float> %a, <16 x float> %b, i16 %mask) nounwind {
-  %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  %m = bitcast i16 %mask to <16 x i1>
-  %res = select <16 x i1> %m, <16 x float> %c, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-; CHECK-LABEL: test10:
-; CHECK: vpermt2ps (
-; CHECK: ret
-define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
-  %c = load <16 x float>* %b
-  %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  ret <16 x float> %d
-}
-
-; CHECK-LABEL: test11:
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
-  %c = load <16 x i32>* %b
-  %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
-  ret <16 x i32> %d
-}
-
-; CHECK-LABEL: test12
-; CHECK: vmovlhps {{.*}}## encoding: [0x62
-; CHECK: ret
-define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) nounwind {
-  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  ret <4 x i32> %c
-}
-
-; CHECK-LABEL: test13
-; CHECK: vpermilps $-79, %zmm
-; CHECK: ret
-define <16 x float> @test13(<16 x float> %a) {
-  %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-  ret <16 x float> %b
-}
-
-; CHECK-LABEL: test14
-; CHECK: vpermilpd $-53, %zmm
-; CHECK: ret
-define <8 x double> @test14(<8 x double> %a) {
-  %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7>
-  ret <8 x double> %b
-}
-
-; CHECK-LABEL: test15
-; CHECK: vpshufd $-79, %zmm
-; CHECK: ret
-define <16 x i32> @test15(<16 x i32> %a) {
-  %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-  ret <16 x i32> %b
-}
-; CHECK-LABEL: test16
-; CHECK: valignq $2, %zmm0, %zmm1
-; CHECK: ret
-define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-  ret <8 x double> %c
-}
-
-; CHECK-LABEL: test17
-; CHECK: vshufpd $19, %zmm1, %zmm0
-; CHECK: ret
-define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
-  %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef>
-  ret <8 x double> %c
-}
-
-; CHECK-LABEL: test18
-; CHECK: vpunpckhdq %zmm
-; CHECK: ret
-define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) {
-  %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15, i32 18, i32 26, i32 19, i32 27, i32 22, i32 30, i32 23, i32 31>
-  ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test19
-; CHECK: vpunpckldq %zmm
-; CHECK: ret
-define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) {
-  %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
-  ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test20
-; CHECK: vpunpckhqdq %zmm
-; CHECK: ret
-define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
-  %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15>
-  ret <8 x i64> %b
-}
-
-; CHECK-LABEL: test21
-; CHECK: vunpcklps %zmm
-; CHECK: ret
-define <16 x float> @test21(<16 x float> %a, <16 x float> %c) {
-  %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
-  ret <16 x float> %b
-}
-
-; CHECK-LABEL: test22
-; CHECK: vmovhlps {{.*}}## encoding: [0x62
-; CHECK: ret
-define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) nounwind {
-  %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  ret <4 x i32> %c
-}
-
-; CHECK-LABEL: @test23
-; CHECK: vshufps $-112, %zmm
-; CHECK: ret
-define <16 x float> @test23(<16 x float> %a, <16 x float> %c) {
-  %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30>
-  ret <16 x float> %b
-}
-
-; CHECK-LABEL: @test24
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test25
-; CHECK: vshufps $52
-; CHECK: ret
-define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 undef, i32 4, i32 5, i32 23, i32 undef, i32 8, i32 9, i32 27, i32 undef, i32 12, i32 13, i32 undef, i32 undef>
-  ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test26
-; CHECK: vmovshdup
-; CHECK: ret
-define <16 x i32> @test26(<16 x i32> %a) nounwind {
-  %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef>
-  ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test27
-; CHECK: ret
-define <16 x i32> @test27(<4 x i32>%a) {
-  %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  ret <16 x i32> %res
-}
-
-; CHECK-LABEL: @test28
-; CHECK: vinserti64x4 $1
-; CHECK: ret
-define <16 x i32> @test28(<16 x i32>%x, <16 x i32>%y) {
-  %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
-                                                               i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-  ret <16 x i32> %res
-}
-
-; CHECK-LABEL: @test29
-; CHECK: vinserti64x4 $0
-; CHECK: ret
-define <16 x i32> @test29(<16 x i32>%x, <16 x i32>%y) {
-  %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
-                                                               i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  ret <16 x i32> %res
-}
-
diff --git a/test/CodeGen/X86/avx512-trunc-ext.ll b/test/CodeGen/X86/avx512-trunc-ext.ll
index 5e097be04cdf..91ef5d58f438 100644
--- a/test/CodeGen/X86/avx512-trunc-ext.ll
+++ b/test/CodeGen/X86/avx512-trunc-ext.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s
 
 ; CHECK-LABEL: trunc_16x32_to_16x8
 ; CHECK: vpmovdb
@@ -118,6 +119,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
 
 ; CHECK-LABEL: sext_8i1_8i32
 ; CHECK: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX: vpmovm2d
 ; CHECK: ret
 define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
   %x = icmp slt <8 x i32> %a1, %a2
@@ -135,9 +137,8 @@ define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) {
 }
 
 ; CHECK-LABEL: trunc_i32_to_i1
-; CHECK: andl
-; CHECK: kmov
-; CHECK: kortest
+; CHECK: testb
+; CHECK: setne
 ; CHECK: orl
 ; CHECK: ret
 define i16 @trunc_i32_to_i1(i32 %a) {
@@ -146,3 +147,30 @@ define i16 @trunc_i32_to_i1(i32 %a) {
   %res = bitcast <16 x i1> %maskv to i16
   ret i16 %res
 }
+
+; CHECK-LABEL: sext_8i1_8i16
+; SKX: vpmovm2w
+; CHECK: ret
+define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+  %x = icmp slt <8 x i32> %a1, %a2
+  %y = sext <8 x i1> %x to <8 x i16>
+  ret <8 x i16> %y
+}
+
+; CHECK-LABEL: sext_16i1_16i32
+; SKX: vpmovm2d
+; CHECK: ret
+define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
+  %x = icmp slt <16 x i32> %a1, %a2
+  %y = sext <16 x i1> %x to <16 x i32>
+  ret <16 x i32> %y
+}
+
+; CHECK-LABEL: sext_8i1_8i64
+; SKX: vpmovm2q
+; CHECK: ret
+define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+  %x = icmp slt <8 x i32> %a1, %a2
+  %y = sext <8 x i1> %x to <8 x i64>
+  ret <8 x i64> %y
+}
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 9c6db11d8f45..5bb82338d087 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -1,59 +1,192 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
-;CHECK-LABEL: _inreg16xi32:
-;CHECK: vpbroadcastd {{.*}}, %zmm
-;CHECK: ret
 define <16 x i32> @_inreg16xi32(i32 %a) {
+; CHECK-LABEL: _inreg16xi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0
+; CHECK-NEXT: retq
   %b = insertelement <16 x i32> undef, i32 %a, i32 0
   %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
   ret <16 x i32> %c
 }
 
-;CHECK-LABEL: _inreg8xi64:
-;CHECK: vpbroadcastq {{.*}}, %zmm
-;CHECK: ret
 define <8 x i64> @_inreg8xi64(i64 %a) {
+; CHECK-LABEL: _inreg8xi64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0
+; CHECK-NEXT: retq
   %b = insertelement <8 x i64> undef, i64 %a, i32 0
   %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
   ret <8 x i64> %c
 }
 
-;CHECK-LABEL: _inreg16xfloat:
-;CHECK: vbroadcastss {{.*}}, %zmm
+;CHECK-LABEL: _ss16xfloat_v4
+;CHECK: vbroadcastss %xmm0, %zmm0
 ;CHECK: ret
+define <16 x float> @_ss16xfloat_v4(<4 x float> %a) {
+  %b = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> zeroinitializer
+  ret <16 x float> %b
+}
+
 define <16 x float> @_inreg16xfloat(float %a) {
+; CHECK-LABEL: _inreg16xfloat:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT: retq
   %b = insertelement <16 x float> undef, float %a, i32 0
   %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
   ret <16 x float> %c
 }
 
-;CHECK-LABEL: _inreg8xdouble:
-;CHECK: vbroadcastsd {{.*}}, %zmm
+;CHECK-LABEL: _ss16xfloat_mask:
+;CHECK: vbroadcastss %xmm0, %zmm1 {%k1}
+;CHECK: ret
+define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> %i
+  ret <16 x float> %r
+}
+
+;CHECK-LABEL: _ss16xfloat_maskz:
+;CHECK: vbroadcastss %xmm0, %zmm0 {%k1} {z}
 ;CHECK: ret
+define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> zeroinitializer
+  ret <16 x float> %r
+}
+
+;CHECK-LABEL: _ss16xfloat_load:
+;CHECK: vbroadcastss (%{{.*}}, %zmm
+;CHECK: ret
+define <16 x float> @_ss16xfloat_load(float* %a.ptr) {
+  %a = load float* %a.ptr
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  ret <16 x float> %c
+}
+
+;CHECK-LABEL: _ss16xfloat_mask_load:
+;CHECK: vbroadcastss (%rdi), %zmm0 {%k1}
+;CHECK: ret
+define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) {
+  %a = load float* %a.ptr
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> %i
+  ret <16 x float> %r
+}
+
+;CHECK-LABEL: _ss16xfloat_maskz_load:
+;CHECK: vbroadcastss (%rdi), %zmm0 {%k1} {z}
+;CHECK: ret
+define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) {
+  %a = load float* %a.ptr
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> zeroinitializer
+  ret <16 x float> %r
+}
+
 define <8 x double> @_inreg8xdouble(double %a) {
+; CHECK-LABEL: _inreg8xdouble:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT: retq
   %b = insertelement <8 x double> undef, double %a, i32 0
   %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
   ret <8 x double> %c
 }
 
-;CHECK-LABEL: _xmm16xi32
-;CHECK: vpbroadcastd
+;CHECK-LABEL: _sd8xdouble_mask:
+;CHECK: vbroadcastsd %xmm0, %zmm1 {%k1}
+;CHECK: ret
+define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> %i
+  ret <8 x double> %r
+}
+
+;CHECK-LABEL: _sd8xdouble_maskz:
+;CHECK: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
 ;CHECK: ret
+define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> zeroinitializer
+  ret <8 x double> %r
+}
+
+;CHECK-LABEL: _sd8xdouble_load:
+;CHECK: vbroadcastsd (%rdi), %zmm
+;CHECK: ret
+define <8 x double> @_sd8xdouble_load(double* %a.ptr) {
+  %a = load double* %a.ptr
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  ret <8 x double> %c
+}
+
+;CHECK-LABEL: _sd8xdouble_mask_load:
+;CHECK: vbroadcastsd (%rdi), %zmm0 {%k1}
+;CHECK: ret
+define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) {
+  %a = load double* %a.ptr
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> %i
+  ret <8 x double> %r
+}
+
+define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) {
+; CHECK-LABEL: _sd8xdouble_maskz_load:
+; CHECK: vbroadcastsd (%rdi), %zmm0 {%k1} {z}
+; CHECK: ret
+  %a = load double* %a.ptr
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> zeroinitializer
+  ret <8 x double> %r
+}
+
 define <16 x i32> @_xmm16xi32(<16 x i32> %a) {
+; CHECK-LABEL: _xmm16xi32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0
+; CHECK-NEXT: retq
   %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer
   ret <16 x i32> %b
 }
 
-;CHECK-LABEL: _xmm16xfloat
-;CHECK: vbroadcastss {{.*}}## encoding: [0x62
-;CHECK: ret
 define <16 x float> @_xmm16xfloat(<16 x float> %a) {
+; CHECK-LABEL: _xmm16xfloat:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT: retq
   %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
   ret <16 x float> %b
 }
 
 define <16 x i32> @test_vbroadcast() {
-  ; CHECK: vpbroadcastd
+; CHECK-LABEL: test_vbroadcast:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: vcmpunordps %zmm0, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK-NEXT: knotw %k1, %k1
+; CHECK-NEXT: vmovdqu32 %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
 entry:
   %0 = sext <16 x i1> zeroinitializer to <16 x i32>
   %1 = fcmp uno <16 x float> undef, zeroinitializer
@@ -62,3 +195,108 @@ entry:
   ret <16 x i32> %3
 }
 
+; We implement the set1 intrinsics with vector initializers. Verify that the
+; IR generated will produce broadcasts at the end.
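+; For instance, a front end expanding _mm512_set1_pd(d) is expected to emit
+; the insertelement chain in test_set1_pd below, which should then collapse
+; into a single vbroadcastsd of %xmm0.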
+define <8 x double> @test_set1_pd(double %d) #2 {
+; CHECK-LABEL: test_set1_pd:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT: retq
+entry:
+  %vecinit.i = insertelement <8 x double> undef, double %d, i32 0
+  %vecinit1.i = insertelement <8 x double> %vecinit.i, double %d, i32 1
+  %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %d, i32 2
+  %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %d, i32 3
+  %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %d, i32 4
+  %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %d, i32 5
+  %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %d, i32 6
+  %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %d, i32 7
+  ret <8 x double> %vecinit7.i
+}
+
+define <8 x i64> @test_set1_epi64(i64 %d) #2 {
+; CHECK-LABEL: test_set1_epi64:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0
+; CHECK-NEXT: retq
+entry:
+  %vecinit.i = insertelement <8 x i64> undef, i64 %d, i32 0
+  %vecinit1.i = insertelement <8 x i64> %vecinit.i, i64 %d, i32 1
+  %vecinit2.i = insertelement <8 x i64> %vecinit1.i, i64 %d, i32 2
+  %vecinit3.i = insertelement <8 x i64> %vecinit2.i, i64 %d, i32 3
+  %vecinit4.i = insertelement <8 x i64> %vecinit3.i, i64 %d, i32 4
+  %vecinit5.i = insertelement <8 x i64> %vecinit4.i, i64 %d, i32 5
+  %vecinit6.i = insertelement <8 x i64> %vecinit5.i, i64 %d, i32 6
+  %vecinit7.i = insertelement <8 x i64> %vecinit6.i, i64 %d, i32 7
+  ret <8 x i64> %vecinit7.i
+}
+
+define <16 x float> @test_set1_ps(float %f) #2 {
+; CHECK-LABEL: test_set1_ps:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT: retq
+entry:
+  %vecinit.i = insertelement <16 x float> undef, float %f, i32 0
+  %vecinit1.i = insertelement <16 x float> %vecinit.i, float %f, i32 1
+  %vecinit2.i = insertelement <16 x float> %vecinit1.i, float %f, i32 2
+  %vecinit3.i = insertelement <16 x float> %vecinit2.i, float %f, i32 3
+  %vecinit4.i = insertelement <16 x float> %vecinit3.i, float %f, i32 4
+  %vecinit5.i = insertelement <16 x float> %vecinit4.i, float %f, i32 5
+  %vecinit6.i = insertelement <16 x float> %vecinit5.i, float %f, i32 6
+  %vecinit7.i = insertelement <16 x float> %vecinit6.i, float %f, i32 7
+  %vecinit8.i = insertelement <16 x float> %vecinit7.i, float %f, i32 8
+  %vecinit9.i = insertelement <16 x float> %vecinit8.i, float %f, i32 9
+  %vecinit10.i = insertelement <16 x float> %vecinit9.i, float %f, i32 10
+  %vecinit11.i = insertelement <16 x float> %vecinit10.i, float %f, i32 11
+  %vecinit12.i = insertelement <16 x float> %vecinit11.i, float %f, i32 12
+  %vecinit13.i = insertelement <16 x float> %vecinit12.i, float %f, i32 13
+  %vecinit14.i = insertelement <16 x float> %vecinit13.i, float %f, i32 14
+  %vecinit15.i = insertelement <16 x float> %vecinit14.i, float %f, i32 15
+  ret <16 x float> %vecinit15.i
+}
+
+define <16 x i32> @test_set1_epi32(i32 %f) #2 {
+; CHECK-LABEL: test_set1_epi32:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0
+; CHECK-NEXT: retq
+entry:
+  %vecinit.i = insertelement <16 x i32> undef, i32 %f, i32 0
+  %vecinit1.i = insertelement <16 x i32> %vecinit.i, i32 %f, i32 1
+  %vecinit2.i = insertelement <16 x i32> %vecinit1.i, i32 %f, i32 2
+  %vecinit3.i = insertelement <16 x i32> %vecinit2.i, i32 %f, i32 3
+  %vecinit4.i = insertelement <16 x i32> %vecinit3.i, i32 %f, i32 4
+  %vecinit5.i = insertelement <16 x i32> %vecinit4.i, i32 %f, i32 5
+  %vecinit6.i = insertelement <16 x i32> %vecinit5.i, i32 %f, i32 6
+  %vecinit7.i = insertelement <16 x i32> %vecinit6.i, i32 %f, i32 7
+  %vecinit8.i = insertelement <16 x i32> %vecinit7.i, i32 %f, i32 8
+  %vecinit9.i = insertelement <16 x i32> %vecinit8.i, i32 %f, i32 9
+  %vecinit10.i = insertelement <16 x i32> %vecinit9.i, i32 %f, i32 10
+  %vecinit11.i = insertelement <16 x i32> %vecinit10.i, i32 %f, i32 11
+  %vecinit12.i = insertelement <16 x i32> %vecinit11.i, i32 %f, i32 12
+  %vecinit13.i = insertelement <16 x i32> %vecinit12.i, i32 %f, i32 13
+  %vecinit14.i = insertelement <16 x i32> %vecinit13.i, i32 %f, i32 14
+  %vecinit15.i = insertelement <16 x i32> %vecinit14.i, i32 %f, i32 15
+  ret <16 x i32> %vecinit15.i
+}
+
+; We implement the scalar broadcast intrinsics with vector initializers.
+; Verify that the IR generated will produce the broadcast at the end.
+define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) {
+; CHECK-LABEL: test_mm512_broadcastsd_pd:
+; CHECK: ## BB#0: ## %entry
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT: retq
+entry:
+  %0 = extractelement <2 x double> %a, i32 0
+  %vecinit.i = insertelement <8 x double> undef, double %0, i32 0
+  %vecinit1.i = insertelement <8 x double> %vecinit.i, double %0, i32 1
+  %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %0, i32 2
+  %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %0, i32 3
+  %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %0, i32 4
+  %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %0, i32 5
+  %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %0, i32 6
+  %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %0, i32 7
+  ret <8 x double> %vecinit7.i
+}
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index d762f0083e35..b16f5c9663c6 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1,145 +1,176 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
-; CHECK-LABEL: test1
-; CHECK: vcmpleps
-; CHECK: vmovups
-; CHECK: ret
 define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
   %mask = fcmp ole <16 x float> %x, %y
   %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
   ret <16 x float> %max
 }
 
-; CHECK-LABEL: test2
-; CHECK: vcmplepd
-; CHECK: vmovupd
-; CHECK: ret
 define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
+; CHECK-LABEL: test2:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
   %mask = fcmp ole <8 x double> %x, %y
  %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
   ret <8 x double> %max
 }
 
-; CHECK-LABEL: test3
-; CHECK: vpcmpeqd (%rdi)
-; CHECK: vmovdqu32
-; CHECK: ret
 define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
   %y = load <16 x i32>* %yp, align 4
   %mask = icmp eq <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
   ret <16 x i32> %max
 }
 
-; CHECK-LABEL: @test4_unsigned
-; CHECK: vpcmpnltud
-; CHECK: vmovdqu32
-; CHECK: ret
-define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y) nounwind {
+define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind {
+; CHECK-LABEL: test4_unsigned:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
   %mask = icmp uge <16 x i32> %x, %y
-  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+  %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
   ret <16 x i32> %max
 }
 
-; CHECK-LABEL: test5
-; CHECK: vpcmpeqq {{.*}}%k1
-; CHECK: vmovdqu64 {{.*}}%k1
-; CHECK: ret
 define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
+; CHECK-LABEL: test5:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
  %mask = icmp eq <8 x i64> %x, %y
   %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
   ret <8 x i64> %max
 }
 
-; CHECK-LABEL: test6_unsigned
-; CHECK: vpcmpnleuq {{.*}}%k1
-; CHECK: vmovdqu64 {{.*}}%k1
-; CHECK: ret
-define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y) nounwind {
+define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) nounwind {
+; CHECK-LABEL: test6_unsigned:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
   %mask = icmp ugt <8 x i64> %x, %y
-  %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
+  %max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
   ret <8 x i64> %max
 }
 
-; CHECK-LABEL: test7
-; CHECK: xor
-; CHECK: vcmpltps
-; CHECK: vblendvps
-; CHECK: ret
 define <4 x float> @test7(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpltps %xmm2, %xmm0, %xmm2
+; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
   %mask = fcmp olt <4 x float> %a, zeroinitializer
   %c = select <4 x i1>%mask, <4 x float>%a, <4 x float>%b
   ret <4 x float>%c
 }
 
-; CHECK-LABEL: test8
-; CHECK: xor
-; CHECK: vcmpltpd
-; CHECK: vblendvpd
-; CHECK: ret
 define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2
+; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
   %mask = fcmp olt <2 x double> %a, zeroinitializer
   %c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b
   ret <2 x double>%c
 }
 
-; CHECK-LABEL: test9
-; CHECK: vpcmpeqd
-; CHECK: vpblendmd
-; CHECK: ret
 define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
+; CHECK-LABEL: test9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: YMM1<def> YMM1<kill> ZMM1<def>
+; CHECK-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<def>
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<kill>
+; CHECK-NEXT: retq
   %mask = icmp eq <8 x i32> %x, %y
   %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
   ret <8 x i32> %max
 }
 
-; CHECK-LABEL: test10
-; CHECK: vcmpeqps
-; CHECK: vblendmps
-; CHECK: ret
 define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
+; CHECK-LABEL: test10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: YMM1<def> YMM1<kill> ZMM1<def>
+; CHECK-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<def>
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<kill>
+; CHECK-NEXT: retq
   %mask = fcmp oeq <8 x float> %x, %y
   %max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
   ret <8 x float> %max
 }
 
-; CHECK-LABEL: test11_unsigned
-; CHECK: vpmaxud
-; CHECK: ret
 define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
+; CHECK-LABEL: test11_unsigned:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
   %mask = icmp ugt <8 x i32> %x, %y
   %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
   ret <8 x i32> %max
 }
 
-; CHECK-LABEL: test12
-; CHECK: vpcmpeqq %zmm2, %zmm0, [[LO:%k[0-7]]]
-; CHECK: vpcmpeqq %zmm3, %zmm1, [[HI:%k[0-7]]]
-; CHECK: kunpckbw [[LO]], [[HI]], {{%k[0-7]}}
 define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
+; CHECK-LABEL: test12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: kunpckbw %k0, %k1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: AX<def> AX<kill> EAX<kill>
+; CHECK-NEXT: retq
   %res = icmp eq <16 x i64> %a, %b
   %res1 = bitcast <16 x i1> %res to i16
   ret i16 %res1
 }
 
-; CHECK-LABEL: test13
-; CHECK: vcmpeqps %zmm
-; CHECK: vpbroadcastd
-; CHECK: ret
 define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
+; CHECK-LABEL: test13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
 {
   %cmpvector_i = fcmp oeq <16 x float> %a, %b
   %conv = zext <16 x i1> %cmpvector_i to <16 x i32>
   ret <16 x i32> %conv
 }
 
-; CHECK-LABEL: test14
-; CHECK: vpcmp
-; CHECK-NOT: vpcmp
-; CHECK: vmovdqu32 {{.*}}{%k1} {z}
-; CHECK: ret
 define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
+; CHECK-LABEL: test14:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm1
+; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
+; CHECK-NEXT: knotw %k0, %k0
+; CHECK-NEXT: knotw %k0, %k1
+; CHECK-NEXT: vmovdqu32 %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
   %sub_r = sub <16 x i32> %a, %b
   %cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a
   %sext.i3.i = sext <16 x i1> %cmp.i2.i to <16 x i32>
@@ -148,12 +179,15 @@ define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
   ret <16 x i32>%res
 }
 
-; CHECK-LABEL: test15
-; CHECK: vpcmpgtq
-; CHECK-NOT: vpcmp
-; CHECK: vmovdqu64 {{.*}}{%k1} {z}
-; CHECK: ret
 define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
+; CHECK-LABEL: test15:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm1
+; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
+; CHECK-NEXT: knotw %k0, %k0
+; CHECK-NEXT: knotw %k0, %k1
+; CHECK-NEXT: vmovdqu64 %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
   %sub_r = sub <8 x i64> %a, %b
   %cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a
   %sext.i3.i = sext <8 x i1> %cmp.i2.i to <8 x i64>
@@ -162,3 +196,181 @@ define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
   ret <8 x i64>%res
 }
 
+define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind {
+; CHECK-LABEL: test16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1
+; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+  %mask = icmp sge <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
+  ret <16 x i32> %max
+}
+
+define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
+; CHECK-LABEL: test17:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+  %y = load <16 x i32>* %y.ptr, align 4
+  %mask = icmp sgt <16 x i32> %x, %y
+  %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32>
%x1 + ret <16 x i32> %max +} + +define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind { +; CHECK-LABEL: test18: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1 +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %y = load <16 x i32>* %y.ptr, align 4 + %mask = icmp sle <16 x i32> %x, %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind { +; CHECK-LABEL: test19: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %y = load <16 x i32>* %y.ptr, align 4 + %mask = icmp ule <16 x i32> %x, %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) nounwind { +; CHECK-LABEL: test20: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %mask1 = icmp eq <16 x i32> %x1, %y1 + %mask0 = icmp eq <16 x i32> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y + ret <16 x i32> %max +} + +define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) nounwind { +; CHECK-LABEL: test21: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k1 +; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mask1 = icmp sge <8 x i64> %x1, %y1 + %mask0 = icmp sle <8 x i64> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1 + ret <8 x i64> %max +} + +define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind { +; CHECK-LABEL: test22: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpgtq %zmm2, %zmm1, %k1 +; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %mask1 = icmp sgt <8 x i64> %x1, %y1 + %y = load <8 x i64>* %y.ptr, align 4 + %mask0 = icmp sgt <8 x i64> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1 + ret <8 x i64> %max +} + +define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind { +; CHECK-LABEL: test23: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1 +; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %mask1 = icmp sge <16 x i32> %x1, %y1 + %y = load <16 x i32>* %y.ptr, align 4 + %mask0 = icmp ule <16 x i32> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind { +; CHECK-LABEL: test24: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; 
CHECK-NEXT: retq + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0 + %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer + %mask = icmp eq <8 x i64> %x, %y + %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1 + ret <8 x i64> %max +} + +define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind { +; CHECK-LABEL: test25: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1 +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer + %mask = icmp sle <16 x i32> %x, %y + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind { +; CHECK-LABEL: test26: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1 +; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1} +; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %mask1 = icmp sge <16 x i32> %x1, %y1 + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer + %mask0 = icmp sgt <16 x i32> %x, %y + %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer + %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1 + ret <16 x i32> %max +} + +define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind { +; CHECK-LABEL: test27: +; CHECK: ## BB#0: +; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1 +; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq + %mask1 = icmp sge <8 x i64> %x1, %y1 + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0 + %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer + %mask0 = icmp sle <8 x i64> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %x1 + ret <8 x i64> %max +} diff --git a/test/CodeGen/X86/avx512-zext-load-crash.ll b/test/CodeGen/X86/avx512-zext-load-crash.ll deleted file mode 100644 index 07ded13a0e3c..000000000000 --- a/test/CodeGen/X86/avx512-zext-load-crash.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s - -define <8 x i16> @test_zext_load() { - ; CHECK: vmovq -entry: - %0 = load <2 x i16> ** undef, align 8 - %1 = getelementptr inbounds <2 x i16>* %0, i64 1 - %2 = load <2 x i16>* %0, align 1 - %3 = shufflevector <2 x i16> %2, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %4 = load <2 x i16>* %1, align 1 - %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %6 = shufflevector <8 x i16> %3, <8 x i16> %5, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %6 -} diff --git a/test/CodeGen/X86/avx512bw-arith.ll b/test/CodeGen/X86/avx512bw-arith.ll new file mode 100644 index 000000000000..94f68a2ddc28 --- /dev/null +++ 
b/test/CodeGen/X86/avx512bw-arith.ll @@ -0,0 +1,102 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw| FileCheck %s + +; CHECK-LABEL: vpaddb512_test +; CHECK: vpaddb %zmm{{.*}} +; CHECK: ret +define <64 x i8> @vpaddb512_test(<64 x i8> %i, <64 x i8> %j) nounwind readnone { + %x = add <64 x i8> %i, %j + ret <64 x i8> %x +} + +; CHECK-LABEL: vpaddb512_fold_test +; CHECK: vpaddb (%rdi), %zmm{{.*}} +; CHECK: ret +define <64 x i8> @vpaddb512_fold_test(<64 x i8> %i, <64 x i8>* %j) nounwind { + %tmp = load <64 x i8>* %j, align 4 + %x = add <64 x i8> %i, %tmp + ret <64 x i8> %x +} + +; CHECK-LABEL: vpaddw512_test +; CHECK: vpaddw %zmm{{.*}} +; CHECK: ret +define <32 x i16> @vpaddw512_test(<32 x i16> %i, <32 x i16> %j) nounwind readnone { + %x = add <32 x i16> %i, %j + ret <32 x i16> %x +} + +; CHECK-LABEL: vpaddw512_fold_test +; CHECK: vpaddw (%rdi), %zmm{{.*}} +; CHECK: ret +define <32 x i16> @vpaddw512_fold_test(<32 x i16> %i, <32 x i16>* %j) nounwind { + %tmp = load <32 x i16>* %j, align 4 + %x = add <32 x i16> %i, %tmp + ret <32 x i16> %x +} + +; CHECK-LABEL: vpaddw512_mask_test +; CHECK: vpaddw %zmm{{.*%k[1-7].*}} +; CHECK: ret +define <32 x i16> @vpaddw512_mask_test(<32 x i16> %i, <32 x i16> %j, <32 x i16> %mask1) nounwind readnone { + %mask = icmp ne <32 x i16> %mask1, zeroinitializer + %x = add <32 x i16> %i, %j + %r = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %i + ret <32 x i16> %r +} + +; CHECK-LABEL: vpaddw512_maskz_test +; CHECK: vpaddw %zmm{{.*{%k[1-7]} {z}.*}} +; CHECK: ret +define <32 x i16> @vpaddw512_maskz_test(<32 x i16> %i, <32 x i16> %j, <32 x i16> %mask1) nounwind readnone { + %mask = icmp ne <32 x i16> %mask1, zeroinitializer + %x = add <32 x i16> %i, %j + %r = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer + ret <32 x i16> %r +} + +; CHECK-LABEL: vpaddw512_mask_fold_test +; CHECK: vpaddw (%rdi), %zmm{{.*%k[1-7]}} +; CHECK: ret +define <32 x i16> @vpaddw512_mask_fold_test(<32 x i16> %i, <32 x i16>* %j.ptr, <32 x i16> %mask1) nounwind readnone { + %mask = icmp ne <32 x i16> %mask1, zeroinitializer + %j = load <32 x i16>* %j.ptr + %x = add <32 x i16> %i, %j + %r = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %i + ret <32 x i16> %r +} + +; CHECK-LABEL: vpaddw512_maskz_fold_test +; CHECK: vpaddw (%rdi), %zmm{{.*{%k[1-7]} {z}}} +; CHECK: ret +define <32 x i16> @vpaddw512_maskz_fold_test(<32 x i16> %i, <32 x i16>* %j.ptr, <32 x i16> %mask1) nounwind readnone { + %mask = icmp ne <32 x i16> %mask1, zeroinitializer + %j = load <32 x i16>* %j.ptr + %x = add <32 x i16> %i, %j + %r = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer + ret <32 x i16> %r +} + +; CHECK-LABEL: vpsubb512_test +; CHECK: vpsubb %zmm{{.*}} +; CHECK: ret +define <64 x i8> @vpsubb512_test(<64 x i8> %i, <64 x i8> %j) nounwind readnone { + %x = sub <64 x i8> %i, %j + ret <64 x i8> %x +} + +; CHECK-LABEL: vpsubw512_test +; CHECK: vpsubw %zmm{{.*}} +; CHECK: ret +define <32 x i16> @vpsubw512_test(<32 x i16> %i, <32 x i16> %j) nounwind readnone { + %x = sub <32 x i16> %i, %j + ret <32 x i16> %x +} + +; CHECK-LABEL: vpmullw512_test +; CHECK: vpmullw %zmm{{.*}} +; CHECK: ret +define <32 x i16> @vpmullw512_test(<32 x i16> %i, <32 x i16> %j) { + %x = mul <32 x i16> %i, %j + ret <32 x i16> %x +} + diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll new file mode 100644 index 000000000000..c807d222ce53 --- /dev/null +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -0,0 +1,353 @@ +; RUN: llc < %s 
-mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding| FileCheck %s + +define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) { +; CHECK-LABEL: test_pcmpeq_b +; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ## + %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) + ret i64 %res +} + +define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_b +; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ## + %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) + ret i64 %res +} + +declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64) + +define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) { +; CHECK-LABEL: test_pcmpeq_w +; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ## + %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1) + ret i32 %res +} + +define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_w +; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ## + %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32) + +define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) { +; CHECK-LABEL: test_pcmpgt_b +; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 ## + %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1) + ret i64 %res +} + +define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_b +; CHECK: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} ## + %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask) + ret i64 %res +} + +declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64) + +define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) { +; CHECK-LABEL: test_pcmpgt_w +; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 ## + %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1) + ret i32 %res +} + +define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_w +; CHECK: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} ## + %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32) + +define <8 x i64> @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { +; CHECK-LABEL: test_cmp_b_512 +; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ## + %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) + %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0 +; CHECK: vpcmpltb %zmm1, %zmm0, %k0 ## + %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1) + %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1 +; CHECK: vpcmpleb %zmm1, %zmm0, %k0 ## + %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1) + %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2 +; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 ## + %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1) + %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3 +; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 ## + %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1) + %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4 +; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 ## + %res5 = call i64 
@llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1) + %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5 +; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 ## + %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1) + %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6 +; CHECK: vpcmpordb %zmm1, %zmm0, %k0 ## + %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1) + %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7 + ret <8 x i64> %vec7 +} + +define <8 x i64> @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { +; CHECK-LABEL: test_mask_cmp_b_512 +; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ## + %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) + %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0 +; CHECK: vpcmpltb %zmm1, %zmm0, %k0 {%k1} ## + %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask) + %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1 +; CHECK: vpcmpleb %zmm1, %zmm0, %k0 {%k1} ## + %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask) + %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2 +; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 {%k1} ## + %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask) + %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3 +; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} ## + %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask) + %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4 +; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} ## + %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask) + %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5 +; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 {%k1} ## + %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask) + %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6 +; CHECK: vpcmpordb %zmm1, %zmm0, %k0 {%k1} ## + %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask) + %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7 + ret <8 x i64> %vec7 +} + +declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone + +define <8 x i64> @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) { +; CHECK-LABEL: test_ucmp_b_512 +; CHECK: vpcmpequb %zmm1, %zmm0, %k0 ## + %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) + %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0 +; CHECK: vpcmpltub %zmm1, %zmm0, %k0 ## + %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1) + %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1 +; CHECK: vpcmpleub %zmm1, %zmm0, %k0 ## + %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1) + %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2 +; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 ## + %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1) + %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3 +; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 ## + %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1) + %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4 +; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 ## + 
%res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1) + %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5 +; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 ## + %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1) + %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6 +; CHECK: vpcmpordub %zmm1, %zmm0, %k0 ## + %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1) + %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7 + ret <8 x i64> %vec7 +} + +define <8 x i64> @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { +; CHECK-LABEL: test_mask_x86_avx512_ucmp_b_512 +; CHECK: vpcmpequb %zmm1, %zmm0, %k0 {%k1} ## + %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) + %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0 +; CHECK: vpcmpltub %zmm1, %zmm0, %k0 {%k1} ## + %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask) + %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1 +; CHECK: vpcmpleub %zmm1, %zmm0, %k0 {%k1} ## + %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask) + %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2 +; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 {%k1} ## + %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask) + %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3 +; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 {%k1} ## + %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask) + %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4 +; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} ## + %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask) + %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5 +; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} ## + %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask) + %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6 +; CHECK: vpcmpordub %zmm1, %zmm0, %k0 {%k1} ## + %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask) + %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7 + ret <8 x i64> %vec7 +} + +declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone + +define <8 x i32> @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) { +; CHECK-LABEL: test_cmp_w_512 +; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ## + %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltw %zmm1, %zmm0, %k0 ## + %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmplew %zmm1, %zmm0, %k0 ## + %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 ## + %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 ## + %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, 
i32 4 +; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 ## + %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 ## + %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmpordw %zmm1, %zmm0, %k0 ## + %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +define <8 x i32> @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { +; CHECK-LABEL: test_mask_cmp_w_512 +; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ## + %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltw %zmm1, %zmm0, %k0 {%k1} ## + %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmplew %zmm1, %zmm0, %k0 {%k1} ## + %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 {%k1} ## + %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} ## + %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} ## + %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 {%k1} ## + %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmpordw %zmm1, %zmm0, %k0 {%k1} ## + %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone + +define <8 x i32> @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) { +; CHECK-LABEL: test_ucmp_w_512 +; CHECK: vpcmpequw %zmm1, %zmm0, %k0 ## + %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 ## + %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 ## + %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 ## + %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 ## + %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1) 
+ %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 ## + %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 ## + %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmporduw %zmm1, %zmm0, %k0 ## + %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +define <8 x i32> @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) { +; CHECK-LABEL: test_mask_ucmp_w_512 +; CHECK: vpcmpequw %zmm1, %zmm0, %k0 {%k1} ## + %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} ## + %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} ## + %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 {%k1} ## + %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 {%k1} ## + %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} ## + %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} ## + %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmporduw %zmm1, %zmm0, %k0 {%k1} ## + %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone + +; CHECK-LABEL: test_x86_mask_blend_b_256 +; CHECK: vpblendmb +define <32 x i8> @test_x86_mask_blend_b_256(i32 %a0, <32 x i8> %a1, <32 x i8> %a2) { + %res = call <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8> %a1, <32 x i8> %a2, i32 %a0) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx512.mask.blend.b.256(<32 x i8>, <32 x i8>, i32) nounwind readonly + +; CHECK-LABEL: test_x86_mask_blend_w_256 +define <16 x i16> @test_x86_mask_blend_w_256(i16 %mask, <16 x i16> %a1, <16 x i16> %a2) { + ; CHECK: vpblendmw + %res = call <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16> %a1, <16 x i16> %a2, i16 %mask) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx512.mask.blend.w.256(<16 x i16>, <16 x i16>, i16) nounwind readonly + +; CHECK-LABEL: test_x86_mask_blend_b_512 +; CHECK: vpblendmb +define <64 x i8> @test_x86_mask_blend_b_512(i64 %a0, <64 x i8> %a1, <64 x i8> %a2) { + %res = 
call <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8> %a1, <64 x i8> %a2, i64 %a0) ; <<64 x i8>> [#uses=1] + ret <64 x i8> %res +} +declare <64 x i8> @llvm.x86.avx512.mask.blend.b.512(<64 x i8>, <64 x i8>, i64) nounwind readonly + +; CHECK-LABEL: test_x86_mask_blend_w_512 +define <32 x i16> @test_x86_mask_blend_w_512(i32 %mask, <32 x i16> %a1, <32 x i16> %a2) { + ; CHECK: vpblendmw + %res = call <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16> %a1, <32 x i16> %a2, i32 %mask) ; <<32 x i16>> [#uses=1] + ret <32 x i16> %res +} +declare <32 x i16> @llvm.x86.avx512.mask.blend.w.512(<32 x i16>, <32 x i16>, i32) nounwind readonly + +; CHECK-LABEL: test_x86_mask_blend_b_128 +; CHECK: vpblendmb +define <16 x i8> @test_x86_mask_blend_b_128(i16 %a0, <16 x i8> %a1, <16 x i8> %a2) { + %res = call <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8> %a1, <16 x i8> %a2, i16 %a0) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.avx512.mask.blend.b.128(<16 x i8>, <16 x i8>, i16) nounwind readonly + +; CHECK-LABEL: test_x86_mask_blend_w_128 +define <8 x i16> @test_x86_mask_blend_w_128(i8 %mask, <8 x i16> %a1, <8 x i16> %a2) { + ; CHECK: vpblendmw + %res = call <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16> %a1, <8 x i16> %a2, i8 %mask) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.avx512.mask.blend.w.128(<8 x i16>, <8 x i16>, i8) nounwind readonly diff --git a/test/CodeGen/X86/avx512bw-mask-op.ll b/test/CodeGen/X86/avx512bw-mask-op.ll new file mode 100644 index 000000000000..9d7630c5d0ad --- /dev/null +++ b/test/CodeGen/X86/avx512bw-mask-op.ll @@ -0,0 +1,99 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s + +define i32 @mask32(i32 %x) { + %m0 = bitcast i32 %x to <32 x i1> + %m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> + %ret = bitcast <32 x i1> %m1 to i32 + ret i32 %ret +; CHECK-LABEL: mask32 +; CHECK: kmovd +; CHECK-NEXT: knotd +; CHECK-NEXT: kmovd +; CHECK-NEXT: ret +} + +define i64 @mask64(i64 %x) { + %m0 = bitcast i64 %x to <64 x i1> + %m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> + %ret = bitcast <64 x i1> %m1 to i64 + ret i64 %ret +; CHECK-LABEL: mask64 +; CHECK: kmovq +; CHECK-NEXT: knotq +; CHECK-NEXT: kmovq +; CHECK-NEXT: ret +} + +define void @mask32_mem(i32* %ptr) { + %x = load i32* %ptr, align 4 + %m0 = bitcast i32 %x to <32 x i1> + %m1 = xor <32 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> + %ret = bitcast <32 x i1> %m1 to i32 + store i32 %ret, i32* %ptr, align 4 + ret void +; CHECK-LABEL: mask32_mem +; CHECK: kmovd ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}} +; CHECK-NEXT: knotd +; CHECK-NEXT: kmovd %k{{[0-7]}}, ([[ARG1]]) +; CHECK-NEXT: ret +} + +define 
void @mask64_mem(i64* %ptr) { + %x = load i64* %ptr, align 4 + %m0 = bitcast i64 %x to <64 x i1> + %m1 = xor <64 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, + i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1> + %ret = bitcast <64 x i1> %m1 to i64 + store i64 %ret, i64* %ptr, align 4 + ret void +; CHECK-LABEL: mask64_mem +; CHECK: kmovq ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}} +; CHECK-NEXT: knotq +; CHECK-NEXT: kmovq %k{{[0-7]}}, ([[ARG1]]) +; CHECK-NEXT: ret +} + +define i32 @mand32(i32 %x, i32 %y) { + %ma = bitcast i32 %x to <32 x i1> + %mb = bitcast i32 %y to <32 x i1> + %mc = and <32 x i1> %ma, %mb + %md = xor <32 x i1> %ma, %mb + %me = or <32 x i1> %mc, %md + %ret = bitcast <32 x i1> %me to i32 +; CHECK: kandd +; CHECK: kxord +; CHECK: kord + ret i32 %ret +} + +define i64 @mand64(i64 %x, i64 %y) { + %ma = bitcast i64 %x to <64 x i1> + %mb = bitcast i64 %y to <64 x i1> + %mc = and <64 x i1> %ma, %mb + %md = xor <64 x i1> %ma, %mb + %me = or <64 x i1> %mc, %md + %ret = bitcast <64 x i1> %me to i64 +; CHECK: kandq +; CHECK: kxorq +; CHECK: korq + ret i64 %ret +} diff --git a/test/CodeGen/X86/avx512bw-mov.ll b/test/CodeGen/X86/avx512bw-mov.ll new file mode 100644 index 000000000000..2ff6d280ab8f --- /dev/null +++ b/test/CodeGen/X86/avx512bw-mov.ll @@ -0,0 +1,81 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s + +; CHECK-LABEL: test1 +; CHECK: vmovdqu8 +; CHECK: ret +define <64 x i8> @test1(i8 * %addr) { + %vaddr = bitcast i8* %addr to <64 x i8>* + %res = load <64 x i8>* %vaddr, align 1 + ret <64 x i8>%res +} + +; CHECK-LABEL: test2 +; CHECK: vmovdqu8 +; CHECK: ret +define void @test2(i8 * %addr, <64 x i8> %data) { + %vaddr = bitcast i8* %addr to <64 x i8>* + store <64 x i8>%data, <64 x i8>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test3 +; CHECK: vmovdqu8{{.*{%k[1-7]}}} +; CHECK: ret +define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) { + %mask = icmp ne <64 x i8> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <64 x i8>* + %r = load <64 x i8>* %vaddr, align 1 + %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> %old + ret <64 x i8>%res +} + +; CHECK-LABEL: test4 +; CHECK: vmovdqu8{{.*{%k[1-7]} {z}}} +; CHECK: ret +define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) { + %mask = icmp ne <64 x i8> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <64 x i8>* + %r = load <64 x i8>* %vaddr, align 1 + %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> zeroinitializer + ret <64 x i8>%res +} + +; CHECK-LABEL: test5 +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test5(i8 * %addr) { + %vaddr = bitcast i8* %addr to <32 x i16>* + %res = load <32 x i16>* %vaddr, align 1 + ret <32 x i16>%res +} + +; CHECK-LABEL: test6 +; CHECK: vmovdqu16 +; CHECK: ret +define void @test6(i8 * %addr, <32 x i16> %data) { + %vaddr = bitcast i8* %addr to <32 x i16>* + store <32 x i16>%data, <32 x i16>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test7 +; CHECK: vmovdqu16{{.*{%k[1-7]}}} +; CHECK: ret +define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) { + %mask = icmp ne <32 x i16> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr 
to <32 x i16>* + %r = load <32 x i16>* %vaddr, align 1 + %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> %old + ret <32 x i16>%res +} + +; CHECK-LABEL: test8 +; CHECK: vmovdqu16{{.*{%k[1-7]} {z}}} +; CHECK: ret +define <32 x i16> @test8(i8 * %addr, <32 x i16> %mask1) { + %mask = icmp ne <32 x i16> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <32 x i16>* + %r = load <32 x i16>* %vaddr, align 1 + %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> zeroinitializer + ret <32 x i16>%res +} diff --git a/test/CodeGen/X86/avx512bw-vec-cmp.ll b/test/CodeGen/X86/avx512bw-vec-cmp.ll new file mode 100644 index 000000000000..6ba4db68662e --- /dev/null +++ b/test/CodeGen/X86/avx512bw-vec-cmp.ll @@ -0,0 +1,135 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s + +; CHECK-LABEL: test1 +; CHECK: vpcmpeqb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind { + %mask = icmp eq <64 x i8> %x, %y + %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y + ret <64 x i8> %max +} + +; CHECK-LABEL: test2 +; CHECK: vpcmpgtb {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind { + %mask = icmp sgt <64 x i8> %x, %y + %max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y + ret <64 x i8> %max +} + +; CHECK-LABEL: @test3 +; CHECK: vpcmplew {{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind { + %mask = icmp sge <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x1, <32 x i16> %y + ret <32 x i16> %max +} + +; CHECK-LABEL: test4 +; CHECK: vpcmpnleub {{.*%k[0-7]}} +; CHECK: vmovdqu8 {{.*}}%k1 +; CHECK: ret +define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind { + %mask = icmp ugt <64 x i8> %x, %y + %max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y + ret <64 x i8> %max +} + +; CHECK-LABEL: test5 +; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwind { + %y = load <32 x i16>* %yp, align 4 + %mask = icmp eq <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} + +; CHECK-LABEL: @test6 +; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind { + %y = load <32 x i16>* %y.ptr, align 4 + %mask = icmp sgt <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} + +; CHECK-LABEL: @test7 +; CHECK: vpcmplew (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind { + %y = load <32 x i16>* %y.ptr, align 4 + %mask = icmp sle <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} + +; CHECK-LABEL: @test8 +; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind { + %y = load <32 x i16>* %y.ptr, align 4 + %mask = icmp ule <32 x i16> %x, %y + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} + +; CHECK-LABEL: @test9 +; CHECK: vpcmpeqw %zmm{{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> 
%x1, <32 x i16> %y1) nounwind { + %mask1 = icmp eq <32 x i16> %x1, %y1 + %mask0 = icmp eq <32 x i16> %x, %y + %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %y + ret <32 x i16> %max +} + +; CHECK-LABEL: @test10 +; CHECK: vpcmpleb %zmm{{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y1) nounwind { + %mask1 = icmp sge <64 x i8> %x1, %y1 + %mask0 = icmp sle <64 x i8> %x, %y + %mask = select <64 x i1> %mask0, <64 x i1> %mask1, <64 x i1> zeroinitializer + %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %x1 + ret <64 x i8> %max +} + +; CHECK-LABEL: @test11 +; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu8 +; CHECK: ret +define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i8> %y1) nounwind { + %mask1 = icmp sgt <64 x i8> %x1, %y1 + %y = load <64 x i8>* %y.ptr, align 4 + %mask0 = icmp sgt <64 x i8> %x, %y + %mask = select <64 x i1> %mask0, <64 x i1> %mask1, <64 x i1> zeroinitializer + %max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %x1 + ret <64 x i8> %max +} + +; CHECK-LABEL: @test12 +; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqu16 +; CHECK: ret +define <32 x i16> @test12(<32 x i16> %x, <32 x i16>* %y.ptr, <32 x i16> %x1, <32 x i16> %y1) nounwind { + %mask1 = icmp sge <32 x i16> %x1, %y1 + %y = load <32 x i16>* %y.ptr, align 4 + %mask0 = icmp ule <32 x i16> %x, %y + %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer + %max = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> %x1 + ret <32 x i16> %max +} diff --git a/test/CodeGen/X86/avx512bwvl-arith.ll b/test/CodeGen/X86/avx512bwvl-arith.ll new file mode 100644 index 000000000000..96f01409f5be --- /dev/null +++ b/test/CodeGen/X86/avx512bwvl-arith.ll @@ -0,0 +1,206 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl| FileCheck %s + +; 256-bit + +; CHECK-LABEL: vpaddb256_test +; CHECK: vpaddb %ymm{{.*}} +; CHECK: ret +define <32 x i8> @vpaddb256_test(<32 x i8> %i, <32 x i8> %j) nounwind readnone { + %x = add <32 x i8> %i, %j + ret <32 x i8> %x +} + +; CHECK-LABEL: vpaddb256_fold_test +; CHECK: vpaddb (%rdi), %ymm{{.*}} +; CHECK: ret +define <32 x i8> @vpaddb256_fold_test(<32 x i8> %i, <32 x i8>* %j) nounwind { + %tmp = load <32 x i8>* %j, align 4 + %x = add <32 x i8> %i, %tmp + ret <32 x i8> %x +} + +; CHECK-LABEL: vpaddw256_test +; CHECK: vpaddw %ymm{{.*}} +; CHECK: ret +define <16 x i16> @vpaddw256_test(<16 x i16> %i, <16 x i16> %j) nounwind readnone { + %x = add <16 x i16> %i, %j + ret <16 x i16> %x +} + +; CHECK-LABEL: vpaddw256_fold_test +; CHECK: vpaddw (%rdi), %ymm{{.*}} +; CHECK: ret +define <16 x i16> @vpaddw256_fold_test(<16 x i16> %i, <16 x i16>* %j) nounwind { + %tmp = load <16 x i16>* %j, align 4 + %x = add <16 x i16> %i, %tmp + ret <16 x i16> %x +} + +; CHECK-LABEL: vpaddw256_mask_test +; CHECK: vpaddw %ymm{{.*%k[1-7].*}} +; CHECK: ret +define <16 x i16> @vpaddw256_mask_test(<16 x i16> %i, <16 x i16> %j, <16 x i16> %mask1) nounwind readnone { + %mask = icmp ne <16 x i16> %mask1, zeroinitializer + %x = add <16 x i16> %i, %j + %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %i + ret <16 x i16> %r +} + +; CHECK-LABEL: vpaddw256_maskz_test +; CHECK: vpaddw %ymm{{.*{%k[1-7]} {z}.*}} +; CHECK: ret +define <16 x i16> @vpaddw256_maskz_test(<16 x i16> %i, <16 x i16> %j, <16 x i16> %mask1) nounwind readnone { + %mask = icmp ne <16 x i16> 
%mask1, zeroinitializer + %x = add <16 x i16> %i, %j + %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer + ret <16 x i16> %r +} + +; CHECK-LABEL: vpaddw256_mask_fold_test +; CHECK: vpaddw (%rdi), %ymm{{.*%k[1-7]}} +; CHECK: ret +define <16 x i16> @vpaddw256_mask_fold_test(<16 x i16> %i, <16 x i16>* %j.ptr, <16 x i16> %mask1) nounwind readnone { + %mask = icmp ne <16 x i16> %mask1, zeroinitializer + %j = load <16 x i16>* %j.ptr + %x = add <16 x i16> %i, %j + %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %i + ret <16 x i16> %r +} + +; CHECK-LABEL: vpaddw256_maskz_fold_test +; CHECK: vpaddw (%rdi), %ymm{{.*{%k[1-7]} {z}}} +; CHECK: ret +define <16 x i16> @vpaddw256_maskz_fold_test(<16 x i16> %i, <16 x i16>* %j.ptr, <16 x i16> %mask1) nounwind readnone { + %mask = icmp ne <16 x i16> %mask1, zeroinitializer + %j = load <16 x i16>* %j.ptr + %x = add <16 x i16> %i, %j + %r = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer + ret <16 x i16> %r +} + +; CHECK-LABEL: vpsubb256_test +; CHECK: vpsubb %ymm{{.*}} +; CHECK: ret +define <32 x i8> @vpsubb256_test(<32 x i8> %i, <32 x i8> %j) nounwind readnone { + %x = sub <32 x i8> %i, %j + ret <32 x i8> %x +} + +; CHECK-LABEL: vpsubw256_test +; CHECK: vpsubw %ymm{{.*}} +; CHECK: ret +define <16 x i16> @vpsubw256_test(<16 x i16> %i, <16 x i16> %j) nounwind readnone { + %x = sub <16 x i16> %i, %j + ret <16 x i16> %x +} + +; CHECK-LABEL: vpmullw256_test +; CHECK: vpmullw %ymm{{.*}} +; CHECK: ret +define <16 x i16> @vpmullw256_test(<16 x i16> %i, <16 x i16> %j) { + %x = mul <16 x i16> %i, %j + ret <16 x i16> %x +} + +; 128-bit + +; CHECK-LABEL: vpaddb128_test +; CHECK: vpaddb %xmm{{.*}} +; CHECK: ret +define <16 x i8> @vpaddb128_test(<16 x i8> %i, <16 x i8> %j) nounwind readnone { + %x = add <16 x i8> %i, %j + ret <16 x i8> %x +} + +; CHECK-LABEL: vpaddb128_fold_test +; CHECK: vpaddb (%rdi), %xmm{{.*}} +; CHECK: ret +define <16 x i8> @vpaddb128_fold_test(<16 x i8> %i, <16 x i8>* %j) nounwind { + %tmp = load <16 x i8>* %j, align 4 + %x = add <16 x i8> %i, %tmp + ret <16 x i8> %x +} + +; CHECK-LABEL: vpaddw128_test +; CHECK: vpaddw %xmm{{.*}} +; CHECK: ret +define <8 x i16> @vpaddw128_test(<8 x i16> %i, <8 x i16> %j) nounwind readnone { + %x = add <8 x i16> %i, %j + ret <8 x i16> %x +} + +; CHECK-LABEL: vpaddw128_fold_test +; CHECK: vpaddw (%rdi), %xmm{{.*}} +; CHECK: ret +define <8 x i16> @vpaddw128_fold_test(<8 x i16> %i, <8 x i16>* %j) nounwind { + %tmp = load <8 x i16>* %j, align 4 + %x = add <8 x i16> %i, %tmp + ret <8 x i16> %x +} + +; CHECK-LABEL: vpaddw128_mask_test +; CHECK: vpaddw %xmm{{.*%k[1-7].*}} +; CHECK: ret +define <8 x i16> @vpaddw128_mask_test(<8 x i16> %i, <8 x i16> %j, <8 x i16> %mask1) nounwind readnone { + %mask = icmp ne <8 x i16> %mask1, zeroinitializer + %x = add <8 x i16> %i, %j + %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %i + ret <8 x i16> %r +} + +; CHECK-LABEL: vpaddw128_maskz_test +; CHECK: vpaddw %xmm{{.*{%k[1-7]} {z}.*}} +; CHECK: ret +define <8 x i16> @vpaddw128_maskz_test(<8 x i16> %i, <8 x i16> %j, <8 x i16> %mask1) nounwind readnone { + %mask = icmp ne <8 x i16> %mask1, zeroinitializer + %x = add <8 x i16> %i, %j + %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer + ret <8 x i16> %r +} + +; CHECK-LABEL: vpaddw128_mask_fold_test +; CHECK: vpaddw (%rdi), %xmm{{.*%k[1-7]}} +; CHECK: ret +define <8 x i16> @vpaddw128_mask_fold_test(<8 x i16> %i, <8 x i16>* %j.ptr, <8 x i16> %mask1) nounwind readnone { + %mask = icmp ne <8 x i16> %mask1, zeroinitializer + %j 
= load <8 x i16>* %j.ptr + %x = add <8 x i16> %i, %j + %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %i + ret <8 x i16> %r +} + +; CHECK-LABEL: vpaddw128_maskz_fold_test +; CHECK: vpaddw (%rdi), %xmm{{.*{%k[1-7]} {z}}} +; CHECK: ret +define <8 x i16> @vpaddw128_maskz_fold_test(<8 x i16> %i, <8 x i16>* %j.ptr, <8 x i16> %mask1) nounwind readnone { + %mask = icmp ne <8 x i16> %mask1, zeroinitializer + %j = load <8 x i16>* %j.ptr + %x = add <8 x i16> %i, %j + %r = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer + ret <8 x i16> %r +} + +; CHECK-LABEL: vpsubb128_test +; CHECK: vpsubb %xmm{{.*}} +; CHECK: ret +define <16 x i8> @vpsubb128_test(<16 x i8> %i, <16 x i8> %j) nounwind readnone { + %x = sub <16 x i8> %i, %j + ret <16 x i8> %x +} + +; CHECK-LABEL: vpsubw128_test +; CHECK: vpsubw %xmm{{.*}} +; CHECK: ret +define <8 x i16> @vpsubw128_test(<8 x i16> %i, <8 x i16> %j) nounwind readnone { + %x = sub <8 x i16> %i, %j + ret <8 x i16> %x +} + +; CHECK-LABEL: vpmullw128_test +; CHECK: vpmullw %xmm{{.*}} +; CHECK: ret +define <8 x i16> @vpmullw128_test(<8 x i16> %i, <8 x i16> %j) { + %x = mul <8 x i16> %i, %j + ret <8 x i16> %x +} + diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll new file mode 100644 index 000000000000..678f252dea42 --- /dev/null +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -0,0 +1,998 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s + +; 256-bit + +define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_pcmpeq_b_256 +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ## + %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1) + ret i32 %res +} + +define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_b_256 +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## + %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32) + +define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_pcmpeq_w_256 +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_w_256 +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16) + +define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_pcmpgt_b_256 +; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 ## + %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1) + ret i32 %res +} + +define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_b_256 +; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## + %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32) + +define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_pcmpgt_w_256 +; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 
-1) + ret i16 %res +} + +define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_w_256 +; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16) + +define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { +; CHECK-LABEL: test_cmp_b_256 +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ## + %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ## + %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ## + %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ## + %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ## + %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ## + %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ## + %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ## + %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { +; CHECK-LABEL: test_mask_cmp_b_256 +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask) + 
%vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone + +define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) { +; CHECK-LABEL: test_ucmp_b_256 +; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ## + %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ## + %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ## + %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ## + %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ## + %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ## + %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ## + %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ## + %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { +; CHECK-LABEL: test_mask_ucmp_b_256 +; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask) + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0 +; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask) + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1 +; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask) + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2 +; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask) + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3 +; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask) + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4 +; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask) + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5 +; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, 
<32 x i8> %a1, i32 6, i32 %mask) + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6 +; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask) + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7 + ret <8 x i32> %vec7 +} + +declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone + +define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { +; CHECK_LABEL: test_cmp_w_256 +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ## + %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ## + %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmplew %ymm1, %ymm0, %k0 ## + %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ## + %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ## + %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ## + %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ## + %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ## + %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) { +; CHECK_LABEL: test_mask_cmp_w_256 +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i16 
@llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone + +define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) { +; CHECK_LABEL: test_ucmp_w_256 +; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ## + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ## + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ## + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ## + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ## + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ## + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 +; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ## + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ## + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) { +; CHECK_LABEL: test_mask_ucmp_w_256 +; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 +; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 +; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 +; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, 
i32 5 +; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 +; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone + +; 128-bit + +define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_pcmpeq_b_128 +; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_b_128 +; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16) + +define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_pcmpeq_w_128 +; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_w_128 +; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8) + +define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_pcmpgt_b_128 +; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 ## + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_b_128 +; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ## + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16) + +define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_pcmpgt_w_128 +; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_w_128 +; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8) + +define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) { +; CHECK_LABEL: test_cmp_b_128 +; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ## + %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 +; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ## + %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 +; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ## + %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1) + %vec2 = 
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
+ %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
+ %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
+ %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
+ %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
+ %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+; CHECK-LABEL: test_mask_cmp_b_128
+; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
+ %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
+ %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
+ %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
+ %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
+ %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
+ %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
+ %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
+ %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
+
+define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: test_ucmp_b_128
+; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
+ %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
+ %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
+ %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
+ %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
+ %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
+ %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
+ %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
+ %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+; CHECK-LABEL: test_mask_ucmp_b_128
+; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
+ %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
+ %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
+; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
+ %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
+ %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
+; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
+ %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
+ %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
+; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
+ %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
+ %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
+; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
+ %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
+ %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
+; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
+ %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
+ %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
+; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
+ %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
+ %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
+; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
+ %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
+ %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
+ ret <8 x i16> %vec7
+}
+
+declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
+
+define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_cmp_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_cmp_w_128
+; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
+
+define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: test_ucmp_w_128
+; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
+; CHECK-LABEL: test_mask_ucmp_w_128
+; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
+ %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
+ %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
+; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
+ %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
+ %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
+; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
+ %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
+ %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
+; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
+ %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
+ %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
+; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
+ %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
+ %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
+; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
+ %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
+ %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
+; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
+ %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
+ %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
+; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
+ %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
+ %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
+ ret <8 x i8> %vec7
+}
+
+declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
+
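+; Note: for the cmp/ucmp intrinsics above, the third (i32) operand selects the
+; predicate: 0=eq, 1=lt, 2=le, 3=false, 4=ne, 5=nlt, 6=nle, 7=true, with the
+; ucmp forms comparing unsigned. As the CHECK lines show, the assembly printer
+; reuses the FP-style unord/ord spellings for the always-false/always-true
+; predicates 3 and 7.
+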
+declare <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmadd256_ps
+ ; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2]
+ %res = call <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+ ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmadd128_ps
+ ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmadd256_pd:
+; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
+ %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
+ ret <4 x double> %res
+}
+
+declare <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmadd128_pd:
+; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
+ %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
+ ret <2 x double> %res
+}
+
+declare <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_vfmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmsub256_ps
+ ; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xaa,0xc2]
+ %res = call <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+ ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_vfmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmsub128_ps
+ ; CHECK: vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xaa,0xc2]
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mask_vfmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmsub256_pd
+ ; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xaa,0xc2]
+ %res = call <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+ ret <4 x double> %res
+}
+
+declare <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mask_vfmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmsub128_pd
+ ; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xaa,0xc2]
+ %res = call <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+ ret <2 x double> %res
+}
+
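+; Note on the 213 forms checked above and below: in AT&T syntax
+; "vfmadd213ps %xmm2, %xmm1, %xmm0" computes xmm0 = (xmm1 * xmm0) + xmm2;
+; the digits name which sources act as multiplicand, multiplier and addend.
+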
+declare <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfnmadd256_ps
+ ; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2]
+ %res = call <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+ ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfnmadd128_ps
+ ; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2]
+ %res = call <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfnmadd256_pd
+ ; CHECK: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2]
+ %res = call <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+ ret <4 x double> %res
+}
+
+declare <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfnmadd128_pd
+ ; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2]
+ %res = call <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+ ret <2 x double> %res
+}
+
+declare <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfnmsub256_ps
+ ; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2]
+ %res = call <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+ ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfnmsub128_ps
+ ; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2]
+ %res = call <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfnmsub256_pd
+ ; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2]
+ %res = call <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+ ret <4 x double> %res
+}
+
+declare <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfnmsub128_pd
+ ; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2]
+ %res = call <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+ ret <2 x double> %res
+}
+
+declare <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmaddsub256_ps:
+; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2]
+ %res = call <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask)
+ ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmaddsub128_ps:
+; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2]
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask)
+ ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmaddsub256_pd
+ ; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2]
+ %res = call <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+ ret <4 x double> %res
+}
+
+declare <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmaddsub128_pd
+ ; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2]
+ %res = call <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+ ret <2 x double> %res
+}
+
+declare <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_vfmsubadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmsubadd256_ps
+ ; CHECK: vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa7,0xc2]
+ %res = call <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+ ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_vfmsubadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmsubadd128_ps
+ ; CHECK: vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa7,0xc2]
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mask_vfmsubadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmsubadd256_pd
+ ; CHECK: vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa7,0xc2]
+ %res = call <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+ ret <4 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mask_vfmsubadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmsubadd128_pd
+ ; CHECK: vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0xc2]
+ %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_vfmsubadd128rm_pd(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmsubadd128rm_pd
+ ; CHECK: vfmsubadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0x07]
+ %a2 = load <2 x double>* %ptr_a2
+ %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+ ret <2 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
+define <8 x double> @test_mask_vfmsubaddrm_pd(<8 x double> %a0, <8 x double> %a1, <8 x double>* %ptr_a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmsubaddrm_pd
+ ; CHECK: vfmsubadd213pd (%rdi), %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa7,0x07]
+ %a2 = load <8 x double>* %ptr_a2, align 8
+ %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
+ ret <8 x double> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmadd128_ps_r
+ ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK-LABEL: test_mask_vfmadd128_ps_rz
+ ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2]
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk
+ ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+ %a2 = load <4 x float>* %ptr_a2
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ ret <4 x float> %res
+}
+
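+; Note: the remaining variants fold one operand from memory. In the CHECK
+; encodings, masked forms use the EVEX prefix (leading 0x62) while the
+; unmasked _rmkz/_rmkza forms below match the shorter VEX encoding (leading
+; 0xc4); the _rmb* forms exercise the EVEX {1to4} broadcast of a scalar load.
+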
+define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka
+ ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+ %a2 = load <4 x float>* %ptr_a2, align 8
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
+ ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz
+ ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
+ %a2 = load <4 x float>* %ptr_a2
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
+ ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza
+ ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07]
+ %a2 = load <4 x float>* %ptr_a2, align 4
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmadd128_ps_rmb
+ ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+ %q = load float* %ptr_a2
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+ %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+ %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmadd128_ps_rmba
+ ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+ %q = load float* %ptr_a2, align 4
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+ %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+ %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
+ ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz
+ ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+ %q = load float* %ptr_a2
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+ %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+ %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
+ ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza
+ ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+ %q = load float* %ptr_a2, align 4
+ %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+ %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+ %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+ %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+ %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
+ ret <4 x float> %res
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmadd128_pd_r
+ ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2]
+ %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK-LABEL: test_mask_vfmadd128_pd_rz
+ ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2]
+ %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk
+ ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
+ %a2 = load <2 x double>* %ptr_a2
+ %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) {
+ ; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz
+ ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0x07]
+ %a2 = load <2 x double>* %ptr_a2
+ %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmadd256_pd_r
+ ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2]
+ %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+ ; CHECK-LABEL: test_mask_vfmadd256_pd_rz
+ ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2]
+ %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) {
+ ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk
+ ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
+ %a2 = load <4 x double>* %ptr_a2
+ %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) {
+ ; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz
+ ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0x07]
+ %a2 = load <4 x double>* %ptr_a2
+ %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
+ ret <4 x double> %res
+}
diff --git a/test/CodeGen/X86/avx512bwvl-mov.ll b/test/CodeGen/X86/avx512bwvl-mov.ll
new file mode 100644
index 000000000000..835844fc821c
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-mov.ll
@@ -0,0 +1,162 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding | FileCheck %s
+
+; CHECK-LABEL: test_256_1
+; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <32 x i8> @test_256_1(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <32 x i8>*
+ %res = load <32 x i8>* %vaddr, align 1
+ ret <32 x i8>%res
+}
+
+; CHECK-LABEL: test_256_2
+; CHECK: vmovdqu8{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_2(i8 * %addr, <32 x i8> %data) {
+ %vaddr = bitcast i8* %addr to <32 x i8>*
+ store <32 x i8>%data, <32 x i8>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_256_3
+; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
+ %mask = icmp ne <32 x i8> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <32 x i8>*
+ %r = load <32 x i8>* %vaddr, align 1
+ %res = select <32 x i1> %mask, <32 x i8> %r, <32 x i8> %old
+ ret <32 x i8>%res
+}
+
+; CHECK-LABEL: test_256_4
+; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) {
+ %mask = icmp ne <32 x i8> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <32 x i8>*
+ %r = load <32 x i8>* %vaddr, align 1
+ %res = select <32 x i1> %mask, <32 x i8> %r, <32 x i8> zeroinitializer
+ ret <32 x i8>%res
+}
+
+; CHECK-LABEL: test_256_5
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define <16 x i16> @test_256_5(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <16 x i16>*
+ %res = load <16 x i16>* %vaddr, align 1
+ ret <16 x i16>%res
+}
+
+; CHECK-LABEL: test_256_6
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_6(i8 * %addr, <16 x i16> %data) {
+ %vaddr = bitcast i8* %addr to <16 x i16>*
+ store <16 x i16>%data, <16 x i16>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_256_7
+; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
+ %mask = icmp ne <16 x i16> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i16>*
+ %r = load <16 x i16>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x i16> %r, <16 x i16> %old
+ ret <16 x i16>%res
+}
+
+; CHECK-LABEL: test_256_8
+; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) {
+ %mask = icmp ne <16 x i16> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i16>*
+ %r = load <16 x i16>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x i16> %r, <16 x i16> zeroinitializer
+ ret <16 x i16>%res
+}
+
+; CHECK-LABEL: test_128_1
+; CHECK: vmovdqu8 {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <16 x i8> @test_128_1(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <16 x i8>*
+ %res = load <16 x i8>* %vaddr, align 1
+ ret <16 x i8>%res
+}
+
+; CHECK-LABEL: test_128_2
+; CHECK: vmovdqu8{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_2(i8 * %addr, <16 x i8> %data) {
+ %vaddr = bitcast i8* %addr to <16 x i8>*
+ store <16 x i8>%data, <16 x i8>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_128_3
+; CHECK: vmovdqu8{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
+ %mask = icmp ne <16 x i8> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i8>*
+ %r = load <16 x i8>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x i8> %r, <16 x i8> %old
+ ret <16 x i8>%res
+}
+
+; CHECK-LABEL: test_128_4
+; CHECK: vmovdqu8{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) {
+ %mask = icmp ne <16 x i8> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i8>*
+ %r = load <16 x i8>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x i8> %r, <16 x i8> zeroinitializer
+ ret <16 x i8>%res
+}
+
+; CHECK-LABEL: test_128_5
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define <8 x i16> @test_128_5(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <8 x i16>*
+ %res = load <8 x i16>* %vaddr, align 1
+ ret <8 x i16>%res
+}
+
+; CHECK-LABEL: test_128_6
+; CHECK: vmovdqu16{{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_128_6(i8 * %addr, <8 x i16> %data) {
+ %vaddr = bitcast i8* %addr to <8 x i16>*
+ store <8 x i16>%data, <8 x i16>* %vaddr, align 1
+ ret void
+}
+
+; CHECK-LABEL: test_128_7
+; CHECK: vmovdqu16{{.*{%k[1-7]} }}## encoding: [0x62
+; CHECK: ret
+define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
+ %mask = icmp ne <8 x i16> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i16>*
+ %r = load <8 x i16>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x i16> %r, <8 x i16> %old
+ ret <8 x i16>%res
+}
+
+; CHECK-LABEL: test_128_8
+; CHECK: vmovdqu16{{.*{%k[1-7]} {z} }}## encoding: [0x62
+; CHECK: ret
+define <8 x i16> @test_128_8(i8 * %addr, <8 x i16> %mask1) {
+ %mask = icmp ne <8 x i16> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i16>*
+ %r = load <8 x i16>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x i16> %r, <8 x i16> zeroinitializer
+ ret <8 x i16>%res
+}
+
diff --git a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
new file mode 100644
index 000000000000..2d13a166a725
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
@@ -0,0 +1,269 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
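+; Note: each test below builds a predicate with icmp (and, in the later
+; tests, ANDs two predicates via select) and feeds it to a vector select;
+; codegen is expected to fold the pattern into a vpcmp* writing a k-register
+; followed by a masked vmovdqu8/vmovdqu16, as the CHECK lines require.
+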
+; CHECK-LABEL: test256_1
+; CHECK: vpcmpeqb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind {
+ %mask = icmp eq <32 x i8> %x, %y
+ %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y
+ ret <32 x i8> %max
+}
+
+; CHECK-LABEL: test256_2
+; CHECK: vpcmpgtb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
+ %mask = icmp sgt <32 x i8> %x, %y
+ %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+ ret <32 x i8> %max
+}
+
+; CHECK-LABEL: @test256_3
+; CHECK: vpcmplew {{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounwind {
+ %mask = icmp sge <16 x i16> %x, %y
+ %max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: test256_4
+; CHECK: vpcmpnleub {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
+ %mask = icmp ugt <32 x i8> %x, %y
+ %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+ ret <32 x i8> %max
+}
+
+; CHECK-LABEL: test256_5
+; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nounwind {
+ %y = load <16 x i16>* %yp, align 4
+ %mask = icmp eq <16 x i16> %x, %y
+ %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_6
+; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+ %y = load <16 x i16>* %y.ptr, align 4
+ %mask = icmp sgt <16 x i16> %x, %y
+ %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_7
+; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+ %y = load <16 x i16>* %y.ptr, align 4
+ %mask = icmp sle <16 x i16> %x, %y
+ %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_8
+; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
+ %y = load <16 x i16>* %y.ptr, align 4
+ %mask = icmp ule <16 x i16> %x, %y
+ %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_9
+; CHECK: vpcmpeqw %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x i16> %y1) nounwind {
+ %mask1 = icmp eq <16 x i16> %x1, %y1
+ %mask0 = icmp eq <16 x i16> %x, %y
+ %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+ %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %y
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: @test256_10
+; CHECK: vpcmpleb %ymm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8> %y1) nounwind {
+ %mask1 = icmp sge <32 x i8> %x1, %y1
+ %mask0 = icmp sle <32 x i8> %x, %y
+ %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
+ %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+ ret <32 x i8> %max
+}
+
+; CHECK-LABEL: @test256_11
+; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32 x i8> %y1) nounwind {
+ %mask1 = icmp sgt <32 x i8> %x1, %y1
+ %y = load <32 x i8>* %y.ptr, align 4
+ %mask0 = icmp sgt <32 x i8> %x, %y
+ %mask = select <32 x i1> %mask0, <32 x i1> %mask1, <32 x i1> zeroinitializer
+ %max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
+ ret <32 x i8> %max
+}
+
+; CHECK-LABEL: @test256_12
+; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1, <16 x i16> %y1) nounwind {
+ %mask1 = icmp sge <16 x i16> %x1, %y1
+ %y = load <16 x i16>* %y.ptr, align 4
+ %mask0 = icmp ule <16 x i16> %x, %y
+ %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+ %max = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %x1
+ ret <16 x i16> %max
+}
+
+; CHECK-LABEL: test128_1
+; CHECK: vpcmpeqb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind {
+ %mask = icmp eq <16 x i8> %x, %y
+ %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y
+ ret <16 x i8> %max
+}
+
+; CHECK-LABEL: test128_2
+; CHECK: vpcmpgtb {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
+ %mask = icmp sgt <16 x i8> %x, %y
+ %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+ ret <16 x i8> %max
+}
+
+; CHECK-LABEL: @test128_3
+; CHECK: vpcmplew {{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind {
+ %mask = icmp sge <8 x i16> %x, %y
+ %max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y
+ ret <8 x i16> %max
+}
+
+; CHECK-LABEL: test128_4
+; CHECK: vpcmpnleub {{.*%k[0-7]}}
+; CHECK: vmovdqu8 {{.*}}%k1
+; CHECK: ret
+define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
+ %mask = icmp ugt <16 x i8> %x, %y
+ %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+ ret <16 x i8> %max
+}
+
+; CHECK-LABEL: test128_5
+; CHECK: vpcmpeqw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwind {
+ %y = load <8 x i16>* %yp, align 4
+ %mask = icmp eq <8 x i16> %x, %y
+ %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+ ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_6
+; CHECK: vpcmpgtw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+ %y = load <8 x i16>* %y.ptr, align 4
+ %mask = icmp sgt <8 x i16> %x, %y
+ %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+ ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_7
+; CHECK: vpcmplew (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+ %y = load <8 x i16>* %y.ptr, align 4
+ %mask = icmp sle <8 x i16> %x, %y
+ %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+ ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_8
+; CHECK: vpcmpleuw (%rdi){{.*%k[0-7]}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
+ %y = load <8 x i16>* %y.ptr, align 4
+ %mask = icmp ule <8 x i16> %x, %y
+ %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+ ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_9
+; CHECK: vpcmpeqw %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16> %y1) nounwind {
+ %mask1 = icmp eq <8 x i16> %x1, %y1
+ %mask0 = icmp eq <8 x i16> %x, %y
+ %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+ %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
+ ret <8 x i16> %max
+}
+
+; CHECK-LABEL: @test128_10
+; CHECK: vpcmpleb %xmm{{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8> %y1) nounwind {
+ %mask1 = icmp sge <16 x i8> %x1, %y1
+ %mask0 = icmp sle <16 x i8> %x, %y
+ %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+ %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+ ret <16 x i8> %max
+}
+
+; CHECK-LABEL: @test128_11
+; CHECK: vpcmpgtb (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu8
+; CHECK: ret
+define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16 x i8> %y1) nounwind {
+ %mask1 = icmp sgt <16 x i8> %x1, %y1
+ %y = load <16 x i8>* %y.ptr, align 4
+ %mask0 = icmp sgt <16 x i8> %x, %y
+ %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
+ %max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
+ ret <16 x i8> %max
+}
+
+; CHECK-LABEL: @test128_12
+; CHECK: vpcmpleuw (%rdi){{.*{%k[1-7]}}}
+; CHECK: vmovdqu16
+; CHECK: ret
+define <8 x i16> @test128_12(<8 x i16> %x, <8 x i16>* %y.ptr, <8 x i16> %x1, <8 x i16> %y1) nounwind {
+ %mask1 = icmp sge <8 x i16> %x1, %y1
+ %y = load <8 x i16>* %y.ptr, align 4
+ %mask0 = icmp ule <8 x i16> %x, %y
+ %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
+ %max = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %x1
+ ret <8 x i16> %max
+}
diff --git a/test/CodeGen/X86/avx512dq-mask-op.ll b/test/CodeGen/X86/avx512dq-mask-op.ll
new file mode 100644
index 000000000000..32a2633f8d06
--- /dev/null
+++ b/test/CodeGen/X86/avx512dq-mask-op.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+
+define i8 @mask8(i8 %x) {
+ %m0 = bitcast i8 %x to <8 x i1>
+ %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <8 x i1> %m1 to i8
+ ret i8 %ret
+; CHECK: mask8
+; CHECK: knotb
+; CHECK: ret
+}
+
+define void @mask8_mem(i8* %ptr) {
+ %x = load i8* %ptr, align 4
+ %m0 = bitcast i8 %x to <8 x i1>
+ %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <8 x i1> %m1 to i8
+ store i8 %ret, i8* %ptr, align 4
+ ret void
+; CHECK-LABEL: mask8_mem
+; CHECK: kmovb ([[ARG1:%rdi|%rcx]]), %k{{[0-7]}}
+; CHECK-NEXT: knotb
+; CHECK-NEXT: kmovb %k{{[0-7]}}, ([[ARG1]])
+; CHECK: ret
+}
+
+define i8 @mand8(i8 %x, i8 %y) {
+ %ma = bitcast i8 %x to <8 x i1>
+ %mb = bitcast i8 %y to <8 x i1>
+ %mc = and <8 x i1> %ma, %mb
+ %md = xor <8 x i1> %ma, %mb
+ %me = or <8 x i1> %mc, %md
+ %ret = bitcast <8 x i1> %me to i8
+; CHECK: kandb
+; CHECK: kxorb
+; CHECK: korb
+ ret i8 %ret
+}
diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
new file mode 100644
index 000000000000..fa4352e64dce
--- /dev/null
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll
@@ -0,0 +1,116 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=knl --show-mc-encoding | FileCheck %s
+
+define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) {
+ ; CHECK: vrsqrt28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) {
+ ; CHECK: kmovw
+ ; CHECK: vrsqrt28ps %zmm0, %zmm1 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test2_rsqrt28_ps(<16 x float> %a0) {
+ ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 4)
+ ret <16 x float> %res
+}
+
vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0] + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 6, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) { + ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0] + %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8) + ret <16 x float> %res +} + + +declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) { + ; CHECK: vrcp28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0] + %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) { + ; CHECK: vrcp28pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0] + %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <16 x float> @test_exp2_ps_512(<16 x float> %a0) { + ; CHECK: vexp2ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0] + %res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <8 x double> @test_exp2_pd_512(<8 x double> %a0) { + ; CHECK: vexp2pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0] + %res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) { + ; CHECK: vrsqrt28ss %xmm0, %xmm0, %xmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0] + %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_rcp28_ss(<4 x float> %a0) { + ; CHECK: vrcp28ss %xmm0, %xmm0, %xmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0] + %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) { + ; CHECK: vrsqrt28ss %xmm0, %xmm0, %xmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0] + %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ; + ret <4 x float> %res +} + +define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) { + ; CHECK: vrsqrt28ss %xmm1, %xmm0, %xmm2 {%k1}{sae} # encoding: 
[0x62,0xf2,0x7d,0x19,0xcd,0xd1] + %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ; + ret <4 x float> %res +} + +define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) { + ; CHECK: vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0] + %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ; + ret <2 x double> %res +} + +declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone + +define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr ) { + ; CHECK: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07] + %mem = load double * %ptr, align 8 + %mem_v = insertelement <2 x double> undef, double %mem, i32 0 + %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ; + ret <2 x double> %res +} + +define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr ) { + ; CHECK: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12] + %ptr1 = getelementptr double* %ptr, i32 18 + %mem = load double * %ptr1, align 8 + %mem_v = insertelement <2 x double> undef, double %mem, i32 0 + %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ; + ret <2 x double> %res +} + diff --git a/test/CodeGen/X86/avx512vl-arith.ll b/test/CodeGen/X86/avx512vl-arith.ll new file mode 100644 index 000000000000..1f7da7814cc9 --- /dev/null +++ b/test/CodeGen/X86/avx512vl-arith.ll @@ -0,0 +1,794 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl| FileCheck %s + +; 256-bit + +; CHECK-LABEL: vpaddq256_test +; CHECK: vpaddq %ymm{{.*}} +; CHECK: ret +define <4 x i64> @vpaddq256_test(<4 x i64> %i, <4 x i64> %j) nounwind readnone { + %x = add <4 x i64> %i, %j + ret <4 x i64> %x +} + +; CHECK-LABEL: vpaddq256_fold_test +; CHECK: vpaddq (%rdi), %ymm{{.*}} +; CHECK: ret +define <4 x i64> @vpaddq256_fold_test(<4 x i64> %i, <4 x i64>* %j) nounwind { + %tmp = load <4 x i64>* %j, align 4 + %x = add <4 x i64> %i, %tmp + ret <4 x i64> %x +} + +; CHECK-LABEL: vpaddq256_broadcast_test +; CHECK: vpaddq LCP{{.*}}(%rip){1to4}, %ymm{{.*}} +; CHECK: ret +define <4 x i64> @vpaddq256_broadcast_test(<4 x i64> %i) nounwind { + %x = add <4 x i64> %i, <i64 1, i64 1, i64 1, i64 1> + ret <4 x i64> %x +} + +; CHECK-LABEL: vpaddq256_broadcast2_test +; CHECK: vpaddq (%rdi){1to4}, %ymm{{.*}} +; CHECK: ret +define <4 x i64> @vpaddq256_broadcast2_test(<4 x i64> %i, i64* %j.ptr) nounwind { + %j = load i64* %j.ptr + %j.0 = insertelement <4 x i64> undef, i64 %j, i32 0 + %j.v = shufflevector <4 x i64> %j.0, <4 x i64> undef, <4 x i32> zeroinitializer + %x = add <4 x i64> %i, %j.v + ret <4 x i64> %x +} + +; CHECK-LABEL: vpaddd256_test +; CHECK: vpaddd %ymm{{.*}} +; CHECK: ret +define <8 x i32> @vpaddd256_test(<8 x i32> %i, <8 x i32> %j) nounwind readnone { + %x = add <8 x i32> %i, %j + ret <8 x i32> %x +} + +; CHECK-LABEL: vpaddd256_fold_test +; CHECK: vpaddd (%rdi), %ymm{{.*}} +; CHECK: ret +define <8 x i32> @vpaddd256_fold_test(<8 x i32> %i, <8 x i32>* %j) nounwind { + %tmp = load <8 x i32>* %j, align 4 + %x = add <8 x i32> %i, %tmp + ret <8 x i32> %x +} + +; CHECK-LABEL: vpaddd256_broadcast_test +; CHECK: vpaddd 
LCP{{.*}}(%rip){1to8}, %ymm{{.*}} +; CHECK: ret +define <8 x i32> @vpaddd256_broadcast_test(<8 x i32> %i) nounwind { + %x = add <8 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + ret <8 x i32> %x +} + +; CHECK-LABEL: vpaddd256_mask_test +; CHECK: vpaddd %ymm{{.*%k[1-7].*}} +; CHECK: ret +define <8 x i32> @vpaddd256_mask_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mask1) nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %x = add <8 x i32> %i, %j + %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %i + ret <8 x i32> %r +} + +; CHECK-LABEL: vpaddd256_maskz_test +; CHECK: vpaddd %ymm{{.*{%k[1-7]} {z}.*}} +; CHECK: ret +define <8 x i32> @vpaddd256_maskz_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mask1) nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %x = add <8 x i32> %i, %j + %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer + ret <8 x i32> %r +} + +; CHECK-LABEL: vpaddd256_mask_fold_test +; CHECK: vpaddd (%rdi), %ymm{{.*%k[1-7]}} +; CHECK: ret +define <8 x i32> @vpaddd256_mask_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x i32> %mask1) nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %j = load <8 x i32>* %j.ptr + %x = add <8 x i32> %i, %j + %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %i + ret <8 x i32> %r +} + +; CHECK-LABEL: vpaddd256_mask_broadcast_test +; CHECK: vpaddd LCP{{.*}}(%rip){1to8}, %ymm{{.*{%k[1-7]}}} +; CHECK: ret +define <8 x i32> @vpaddd256_mask_broadcast_test(<8 x i32> %i, <8 x i32> %mask1) nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %x = add <8 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %i + ret <8 x i32> %r +} + +; CHECK-LABEL: vpaddd256_maskz_fold_test +; CHECK: vpaddd (%rdi), %ymm{{.*{%k[1-7]} {z}}} +; CHECK: ret +define <8 x i32> @vpaddd256_maskz_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x i32> %mask1) nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %j = load <8 x i32>* %j.ptr + %x = add <8 x i32> %i, %j + %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer + ret <8 x i32> %r +} + +; CHECK-LABEL: vpaddd256_maskz_broadcast_test +; CHECK: vpaddd LCP{{.*}}(%rip){1to8}, %ymm{{.*{%k[1-7]} {z}}} +; CHECK: ret +define <8 x i32> @vpaddd256_maskz_broadcast_test(<8 x i32> %i, <8 x i32> %mask1) nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %x = add <8 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %r = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer + ret <8 x i32> %r +} + +; CHECK-LABEL: vpsubq256_test +; CHECK: vpsubq %ymm{{.*}} +; CHECK: ret +define <4 x i64> @vpsubq256_test(<4 x i64> %i, <4 x i64> %j) nounwind readnone { + %x = sub <4 x i64> %i, %j + ret <4 x i64> %x +} + +; CHECK-LABEL: vpsubd256_test +; CHECK: vpsubd %ymm{{.*}} +; CHECK: ret +define <8 x i32> @vpsubd256_test(<8 x i32> %i, <8 x i32> %j) nounwind readnone { + %x = sub <8 x i32> %i, %j + ret <8 x i32> %x +} + +; CHECK-LABEL: vpmulld256_test +; CHECK: vpmulld %ymm{{.*}} +; CHECK: ret +define <8 x i32> @vpmulld256_test(<8 x i32> %i, <8 x i32> %j) { + %x = mul <8 x i32> %i, %j + ret <8 x i32> %x +} + +; CHECK-LABEL: test_vaddpd_256 +; CHECK: vaddpd{{.*}} +; CHECK: ret +define <4 x double> @test_vaddpd_256(<4 x double> %y, <4 x double> %x) { +entry: + %add.i = fadd <4 x double> %x, %y + ret <4 x double> %add.i +} + +; CHECK-LABEL: test_fold_vaddpd_256 +; CHECK: vaddpd 
LCP{{.*}}(%rip){{.*}} +; CHECK: ret +define <4 x double> @test_fold_vaddpd_256(<4 x double> %y) { +entry: + %add.i = fadd <4 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 4.500000e+00, double 5.600000e+00> + ret <4 x double> %add.i +} + +; CHECK-LABEL: test_broadcast_vaddpd_256 +; CHECK: LCP{{.*}}(%rip){1to8}, %ymm0, %ymm0 +; CHECK: ret +define <8 x float> @test_broadcast_vaddpd_256(<8 x float> %a) nounwind { + %b = fadd <8 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000> + ret <8 x float> %b +} + +; CHECK-LABEL: test_mask_vaddps_256 +; CHECK: vaddps {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <8 x float> @test_mask_vaddps_256(<8 x float> %dst, <8 x float> %i, + <8 x float> %j, <8 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %x = fadd <8 x float> %i, %j + %r = select <8 x i1> %mask, <8 x float> %x, <8 x float> %dst + ret <8 x float> %r +} + +; CHECK-LABEL: test_mask_vmulps_256 +; CHECK: vmulps {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <8 x float> @test_mask_vmulps_256(<8 x float> %dst, <8 x float> %i, + <8 x float> %j, <8 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %x = fmul <8 x float> %i, %j + %r = select <8 x i1> %mask, <8 x float> %x, <8 x float> %dst + ret <8 x float> %r +} + +; CHECK-LABEL: test_mask_vminps_256 +; CHECK: vminps {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <8 x float> @test_mask_vminps_256(<8 x float> %dst, <8 x float> %i, + <8 x float> %j, <8 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %cmp_res = fcmp olt <8 x float> %i, %j + %min = select <8 x i1> %cmp_res, <8 x float> %i, <8 x float> %j + %r = select <8 x i1> %mask, <8 x float> %min, <8 x float> %dst + ret <8 x float> %r +} + +; CHECK-LABEL: test_mask_vmaxps_256 +; CHECK: vmaxps {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <8 x float> @test_mask_vmaxps_256(<8 x float> %dst, <8 x float> %i, + <8 x float> %j, <8 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %cmp_res = fcmp ogt <8 x float> %i, %j + %max = select <8 x i1> %cmp_res, <8 x float> %i, <8 x float> %j + %r = select <8 x i1> %mask, <8 x float> %max, <8 x float> %dst + ret <8 x float> %r +} + +; CHECK-LABEL: test_mask_vsubps_256 +; CHECK: vsubps {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <8 x float> @test_mask_vsubps_256(<8 x float> %dst, <8 x float> %i, + <8 x float> %j, <8 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %x = fsub <8 x float> %i, %j + %r = select <8 x i1> %mask, <8 x float> %x, <8 x float> %dst + ret <8 x float> %r +} + +; CHECK-LABEL: test_mask_vdivps_256 +; CHECK: vdivps {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <8 x float> @test_mask_vdivps_256(<8 x float> %dst, <8 x float> %i, + <8 x float> %j, <8 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %x = fdiv <8 x float> %i, %j + %r = select <8 x i1> %mask, <8 x float> %x, <8 x float> %dst + ret <8 x float> %r +} + +; CHECK-LABEL: test_mask_vmulpd_256 +; CHECK: vmulpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, 
%ymm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <4 x double> @test_mask_vmulpd_256(<4 x double> %dst, <4 x double> %i, + <4 x double> %j, <4 x i64> %mask1) + nounwind readnone { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %x = fmul <4 x double> %i, %j + %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> %dst + ret <4 x double> %r +} + +; CHECK-LABEL: test_mask_vminpd_256 +; CHECK: vminpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <4 x double> @test_mask_vminpd_256(<4 x double> %dst, <4 x double> %i, + <4 x double> %j, <4 x i64> %mask1) + nounwind readnone { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %cmp_res = fcmp olt <4 x double> %i, %j + %min = select <4 x i1> %cmp_res, <4 x double> %i, <4 x double> %j + %r = select <4 x i1> %mask, <4 x double> %min, <4 x double> %dst + ret <4 x double> %r +} + +; CHECK-LABEL: test_mask_vmaxpd_256 +; CHECK: vmaxpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <4 x double> @test_mask_vmaxpd_256(<4 x double> %dst, <4 x double> %i, + <4 x double> %j, <4 x i64> %mask1) + nounwind readnone { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %cmp_res = fcmp ogt <4 x double> %i, %j + %max = select <4 x i1> %cmp_res, <4 x double> %i, <4 x double> %j + %r = select <4 x i1> %mask, <4 x double> %max, <4 x double> %dst + ret <4 x double> %r +} + +; CHECK-LABEL: test_mask_vsubpd_256 +; CHECK: vsubpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <4 x double> @test_mask_vsubpd_256(<4 x double> %dst, <4 x double> %i, + <4 x double> %j, <4 x i64> %mask1) + nounwind readnone { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %x = fsub <4 x double> %i, %j + %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> %dst + ret <4 x double> %r +} + +; CHECK-LABEL: test_mask_vdivpd_256 +; CHECK: vdivpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <4 x double> @test_mask_vdivpd_256(<4 x double> %dst, <4 x double> %i, + <4 x double> %j, <4 x i64> %mask1) + nounwind readnone { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %x = fdiv <4 x double> %i, %j + %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> %dst + ret <4 x double> %r +} + +; CHECK-LABEL: test_mask_vaddpd_256 +; CHECK: vaddpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <4 x double> @test_mask_vaddpd_256(<4 x double> %dst, <4 x double> %i, + <4 x double> %j, <4 x i64> %mask1) + nounwind readnone { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %x = fadd <4 x double> %i, %j + %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> %dst + ret <4 x double> %r +} + +; CHECK-LABEL: test_maskz_vaddpd_256 +; CHECK: vaddpd {{%ymm[0-9]{1,2}, %ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]} {z}}} +; CHECK: ret +define <4 x double> @test_maskz_vaddpd_256(<4 x double> %i, <4 x double> %j, + <4 x i64> %mask1) nounwind readnone { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %x = fadd <4 x double> %i, %j + %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> zeroinitializer + ret <4 x double> %r +} + +; CHECK-LABEL: test_mask_fold_vaddpd_256 +; CHECK: vaddpd (%rdi), {{.*%ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]}.*}} +; CHECK: ret +define <4 x double> @test_mask_fold_vaddpd_256(<4 x double> %dst, <4 x double> %i, + <4 x double>* %j, <4 x i64> %mask1) + nounwind { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %tmp = load <4 x double>* %j + %x = fadd <4 x double> %i, %tmp + 
%r = select <4 x i1> %mask, <4 x double> %x, <4 x double> %dst + ret <4 x double> %r +} + +; CHECK-LABEL: test_maskz_fold_vaddpd_256 +; CHECK: vaddpd (%rdi), {{.*%ymm[0-9]{1,2}, %ymm[0-9]{1,2} {%k[1-7]} {z}.*}} +; CHECK: ret +define <4 x double> @test_maskz_fold_vaddpd_256(<4 x double> %i, <4 x double>* %j, + <4 x i64> %mask1) nounwind { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %tmp = load <4 x double>* %j + %x = fadd <4 x double> %i, %tmp + %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> zeroinitializer + ret <4 x double> %r +} + +; CHECK-LABEL: test_broadcast2_vaddpd_256 +; CHECK: vaddpd (%rdi){1to4}, %ymm{{.*}} +; CHECK: ret +define <4 x double> @test_broadcast2_vaddpd_256(<4 x double> %i, double* %j) nounwind { + %tmp = load double* %j + %b = insertelement <4 x double> undef, double %tmp, i32 0 + %c = shufflevector <4 x double> %b, <4 x double> undef, + <4 x i32> zeroinitializer + %x = fadd <4 x double> %c, %i + ret <4 x double> %x +} + +; CHECK-LABEL: test_mask_broadcast_vaddpd_256 +; CHECK: vaddpd (%rdi){1to4}, %ymm{{.*{%k[1-7]}.*}} +; CHECK: ret +define <4 x double> @test_mask_broadcast_vaddpd_256(<4 x double> %dst, <4 x double> %i, + double* %j, <4 x i64> %mask1) nounwind { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %tmp = load double* %j + %b = insertelement <4 x double> undef, double %tmp, i32 0 + %c = shufflevector <4 x double> %b, <4 x double> undef, + <4 x i32> zeroinitializer + %x = fadd <4 x double> %c, %i + %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> %i + ret <4 x double> %r +} + +; CHECK-LABEL: test_maskz_broadcast_vaddpd_256 +; CHECK: vaddpd (%rdi){1to4}, %ymm{{.*{%k[1-7]} {z}.*}} +; CHECK: ret +define <4 x double> @test_maskz_broadcast_vaddpd_256(<4 x double> %i, double* %j, + <4 x i64> %mask1) nounwind { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %tmp = load double* %j + %b = insertelement <4 x double> undef, double %tmp, i32 0 + %c = shufflevector <4 x double> %b, <4 x double> undef, + <4 x i32> zeroinitializer + %x = fadd <4 x double> %c, %i + %r = select <4 x i1> %mask, <4 x double> %x, <4 x double> zeroinitializer + ret <4 x double> %r +} + +; 128-bit + +; CHECK-LABEL: vpaddq128_test +; CHECK: vpaddq %xmm{{.*}} +; CHECK: ret +define <2 x i64> @vpaddq128_test(<2 x i64> %i, <2 x i64> %j) nounwind readnone { + %x = add <2 x i64> %i, %j + ret <2 x i64> %x +} + +; CHECK-LABEL: vpaddq128_fold_test +; CHECK: vpaddq (%rdi), %xmm{{.*}} +; CHECK: ret +define <2 x i64> @vpaddq128_fold_test(<2 x i64> %i, <2 x i64>* %j) nounwind { + %tmp = load <2 x i64>* %j, align 4 + %x = add <2 x i64> %i, %tmp + ret <2 x i64> %x +} + +; CHECK-LABEL: vpaddq128_broadcast2_test +; CHECK: vpaddq (%rdi){1to2}, %xmm{{.*}} +; CHECK: ret +define <2 x i64> @vpaddq128_broadcast2_test(<2 x i64> %i, i64* %j) nounwind { + %tmp = load i64* %j + %j.0 = insertelement <2 x i64> undef, i64 %tmp, i32 0 + %j.1 = insertelement <2 x i64> %j.0, i64 %tmp, i32 1 + %x = add <2 x i64> %i, %j.1 + ret <2 x i64> %x +} + +; CHECK-LABEL: vpaddd128_test +; CHECK: vpaddd %xmm{{.*}} +; CHECK: ret +define <4 x i32> @vpaddd128_test(<4 x i32> %i, <4 x i32> %j) nounwind readnone { + %x = add <4 x i32> %i, %j + ret <4 x i32> %x +} + +; CHECK-LABEL: vpaddd128_fold_test +; CHECK: vpaddd (%rdi), %xmm{{.*}} +; CHECK: ret +define <4 x i32> @vpaddd128_fold_test(<4 x i32> %i, <4 x i32>* %j) nounwind { + %tmp = load <4 x i32>* %j, align 4 + %x = add <4 x i32> %i, %tmp + ret <4 x i32> %x +} + +; CHECK-LABEL: vpaddd128_broadcast_test +; CHECK: vpaddd LCP{{.*}}(%rip){1to4}, %xmm{{.*}} 
+; CHECK: ret +define <4 x i32> @vpaddd128_broadcast_test(<4 x i32> %i) nounwind { + %x = add <4 x i32> %i, <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %x +} + +; CHECK-LABEL: vpaddd128_mask_test +; CHECK: vpaddd %xmm{{.*%k[1-7].*}} +; CHECK: ret +define <4 x i32> @vpaddd128_mask_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask1) nounwind readnone { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %x = add <4 x i32> %i, %j + %r = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %i + ret <4 x i32> %r +} + +; CHECK-LABEL: vpaddd128_maskz_test +; CHECK: vpaddd %xmm{{.*{%k[1-7]} {z}.*}} +; CHECK: ret +define <4 x i32> @vpaddd128_maskz_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask1) nounwind readnone { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %x = add <4 x i32> %i, %j + %r = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer + ret <4 x i32> %r +} + +; CHECK-LABEL: vpaddd128_mask_fold_test +; CHECK: vpaddd (%rdi), %xmm{{.*%k[1-7]}} +; CHECK: ret +define <4 x i32> @vpaddd128_mask_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x i32> %mask1) nounwind readnone { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %j = load <4 x i32>* %j.ptr + %x = add <4 x i32> %i, %j + %r = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %i + ret <4 x i32> %r +} + +; CHECK-LABEL: vpaddd128_mask_broadcast_test +; CHECK: vpaddd LCP{{.*}}(%rip){1to4}, %xmm{{.*{%k[1-7]}}} +; CHECK: ret +define <4 x i32> @vpaddd128_mask_broadcast_test(<4 x i32> %i, <4 x i32> %mask1) nounwind readnone { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %x = add <4 x i32> %i, <i32 1, i32 1, i32 1, i32 1> + %r = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %i + ret <4 x i32> %r +} + +; CHECK-LABEL: vpaddd128_maskz_fold_test +; CHECK: vpaddd (%rdi), %xmm{{.*{%k[1-7]} {z}}} +; CHECK: ret +define <4 x i32> @vpaddd128_maskz_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x i32> %mask1) nounwind readnone { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %j = load <4 x i32>* %j.ptr + %x = add <4 x i32> %i, %j + %r = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer + ret <4 x i32> %r +} + +; CHECK-LABEL: vpaddd128_maskz_broadcast_test +; CHECK: vpaddd LCP{{.*}}(%rip){1to4}, %xmm{{.*{%k[1-7]} {z}}} +; CHECK: ret +define <4 x i32> @vpaddd128_maskz_broadcast_test(<4 x i32> %i, <4 x i32> %mask1) nounwind readnone { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %x = add <4 x i32> %i, <i32 1, i32 1, i32 1, i32 1> + %r = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer + ret <4 x i32> %r +} + +; CHECK-LABEL: vpsubq128_test +; CHECK: vpsubq %xmm{{.*}} +; CHECK: ret +define <2 x i64> @vpsubq128_test(<2 x i64> %i, <2 x i64> %j) nounwind readnone { + %x = sub <2 x i64> %i, %j + ret <2 x i64> %x +} + +; CHECK-LABEL: vpsubd128_test +; CHECK: vpsubd %xmm{{.*}} +; CHECK: ret +define <4 x i32> @vpsubd128_test(<4 x i32> %i, <4 x i32> %j) nounwind readnone { + %x = sub <4 x i32> %i, %j + ret <4 x i32> %x +} + +; CHECK-LABEL: vpmulld128_test +; CHECK: vpmulld %xmm{{.*}} +; CHECK: ret +define <4 x i32> @vpmulld128_test(<4 x i32> %i, <4 x i32> %j) { + %x = mul <4 x i32> %i, %j + ret <4 x i32> %x +} + +; CHECK-LABEL: test_vaddpd_128 +; CHECK: vaddpd{{.*}} +; CHECK: ret +define <2 x double> @test_vaddpd_128(<2 x double> %y, <2 x double> %x) { +entry: + %add.i = fadd <2 x double> %x, %y + ret <2 x double> %add.i +} + +; CHECK-LABEL: test_fold_vaddpd_128 +; CHECK: vaddpd LCP{{.*}}(%rip){{.*}} +; CHECK: ret +define <2 x double> @test_fold_vaddpd_128(<2 x double> %y) { +entry: + %add.i = fadd <2 
x double> %y, <double 4.500000e+00, double 3.400000e+00> + ret <2 x double> %add.i +} + +; CHECK-LABEL: test_broadcast_vaddpd_128 +; CHECK: LCP{{.*}}(%rip){1to4}, %xmm0, %xmm0 +; CHECK: ret +define <4 x float> @test_broadcast_vaddpd_128(<4 x float> %a) nounwind { + %b = fadd <4 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000> + ret <4 x float> %b +} + +; CHECK-LABEL: test_mask_vaddps_128 +; CHECK: vaddps {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <4 x float> @test_mask_vaddps_128(<4 x float> %dst, <4 x float> %i, + <4 x float> %j, <4 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %x = fadd <4 x float> %i, %j + %r = select <4 x i1> %mask, <4 x float> %x, <4 x float> %dst + ret <4 x float> %r +} + +; CHECK-LABEL: test_mask_vmulps_128 +; CHECK: vmulps {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <4 x float> @test_mask_vmulps_128(<4 x float> %dst, <4 x float> %i, + <4 x float> %j, <4 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %x = fmul <4 x float> %i, %j + %r = select <4 x i1> %mask, <4 x float> %x, <4 x float> %dst + ret <4 x float> %r +} + +; CHECK-LABEL: test_mask_vminps_128 +; CHECK: vminps {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <4 x float> @test_mask_vminps_128(<4 x float> %dst, <4 x float> %i, + <4 x float> %j, <4 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %cmp_res = fcmp olt <4 x float> %i, %j + %min = select <4 x i1> %cmp_res, <4 x float> %i, <4 x float> %j + %r = select <4 x i1> %mask, <4 x float> %min, <4 x float> %dst + ret <4 x float> %r +} + +; CHECK-LABEL: test_mask_vmaxps_128 +; CHECK: vmaxps {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <4 x float> @test_mask_vmaxps_128(<4 x float> %dst, <4 x float> %i, + <4 x float> %j, <4 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %cmp_res = fcmp ogt <4 x float> %i, %j + %max = select <4 x i1> %cmp_res, <4 x float> %i, <4 x float> %j + %r = select <4 x i1> %mask, <4 x float> %max, <4 x float> %dst + ret <4 x float> %r +} + +; CHECK-LABEL: test_mask_vsubps_128 +; CHECK: vsubps {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <4 x float> @test_mask_vsubps_128(<4 x float> %dst, <4 x float> %i, + <4 x float> %j, <4 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %x = fsub <4 x float> %i, %j + %r = select <4 x i1> %mask, <4 x float> %x, <4 x float> %dst + ret <4 x float> %r +} + + +; CHECK-LABEL: test_mask_vdivps_128 +; CHECK: vdivps {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <4 x float> @test_mask_vdivps_128(<4 x float> %dst, <4 x float> %i, + <4 x float> %j, <4 x i32> %mask1) + nounwind readnone { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %x = fdiv <4 x float> %i, %j + %r = select <4 x i1> %mask, <4 x float> %x, <4 x float> %dst + ret <4 x float> %r +} + +; CHECK-LABEL: test_mask_vmulpd_128 +; CHECK: vmulpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <2 x double> @test_mask_vmulpd_128(<2 x double> %dst, <2 x double> %i, + <2 x double> %j, <2 x i64> %mask1) + nounwind readnone { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %x = fmul <2 x double> %i, %j + %r = 
select <2 x i1> %mask, <2 x double> %x, <2 x double> %dst + ret <2 x double> %r +} + +; CHECK-LABEL: test_mask_vminpd_128 +; CHECK: vminpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <2 x double> @test_mask_vminpd_128(<2 x double> %dst, <2 x double> %i, + <2 x double> %j, <2 x i64> %mask1) + nounwind readnone { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %cmp_res = fcmp olt <2 x double> %i, %j + %min = select <2 x i1> %cmp_res, <2 x double> %i, <2 x double> %j + %r = select <2 x i1> %mask, <2 x double> %min, <2 x double> %dst + ret <2 x double> %r +} + +; CHECK-LABEL: test_mask_vmaxpd_128 +; CHECK: vmaxpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <2 x double> @test_mask_vmaxpd_128(<2 x double> %dst, <2 x double> %i, + <2 x double> %j, <2 x i64> %mask1) + nounwind readnone { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %cmp_res = fcmp ogt <2 x double> %i, %j + %max = select <2 x i1> %cmp_res, <2 x double> %i, <2 x double> %j + %r = select <2 x i1> %mask, <2 x double> %max, <2 x double> %dst + ret <2 x double> %r +} + +; CHECK-LABEL: test_mask_vsubpd_128 +; CHECK: vsubpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <2 x double> @test_mask_vsubpd_128(<2 x double> %dst, <2 x double> %i, + <2 x double> %j, <2 x i64> %mask1) + nounwind readnone { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %x = fsub <2 x double> %i, %j + %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> %dst + ret <2 x double> %r +} + +; CHECK-LABEL: test_mask_vdivpd_128 +; CHECK: vdivpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <2 x double> @test_mask_vdivpd_128(<2 x double> %dst, <2 x double> %i, + <2 x double> %j, <2 x i64> %mask1) + nounwind readnone { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %x = fdiv <2 x double> %i, %j + %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> %dst + ret <2 x double> %r +} + +; CHECK-LABEL: test_mask_vaddpd_128 +; CHECK: vaddpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}}} +; CHECK: ret +define <2 x double> @test_mask_vaddpd_128(<2 x double> %dst, <2 x double> %i, + <2 x double> %j, <2 x i64> %mask1) + nounwind readnone { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %x = fadd <2 x double> %i, %j + %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> %dst + ret <2 x double> %r +} + +; CHECK-LABEL: test_maskz_vaddpd_128 +; CHECK: vaddpd {{%xmm[0-9]{1,2}, %xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]} {z}}} +; CHECK: ret +define <2 x double> @test_maskz_vaddpd_128(<2 x double> %i, <2 x double> %j, + <2 x i64> %mask1) nounwind readnone { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %x = fadd <2 x double> %i, %j + %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> zeroinitializer + ret <2 x double> %r +} + +; CHECK-LABEL: test_mask_fold_vaddpd_128 +; CHECK: vaddpd (%rdi), {{.*%xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]}.*}} +; CHECK: ret +define <2 x double> @test_mask_fold_vaddpd_128(<2 x double> %dst, <2 x double> %i, + <2 x double>* %j, <2 x i64> %mask1) + nounwind { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %tmp = load <2 x double>* %j + %x = fadd <2 x double> %i, %tmp + %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> %dst + ret <2 x double> %r +} + +; CHECK-LABEL: test_maskz_fold_vaddpd_128 +; CHECK: vaddpd (%rdi), {{.*%xmm[0-9]{1,2}, %xmm[0-9]{1,2} {%k[1-7]} {z}.*}} +; CHECK: ret +define <2 x double> 
@test_maskz_fold_vaddpd_128(<2 x double> %i, <2 x double>* %j, + <2 x i64> %mask1) nounwind { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %tmp = load <2 x double>* %j + %x = fadd <2 x double> %i, %tmp + %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> zeroinitializer + ret <2 x double> %r +} + +; CHECK-LABEL: test_broadcast2_vaddpd_128 +; CHECK: vaddpd (%rdi){1to2}, %xmm{{.*}} +; CHECK: ret +define <2 x double> @test_broadcast2_vaddpd_128(<2 x double> %i, double* %j) nounwind { + %tmp = load double* %j + %j.0 = insertelement <2 x double> undef, double %tmp, i64 0 + %j.1 = insertelement <2 x double> %j.0, double %tmp, i64 1 + %x = fadd <2 x double> %j.1, %i + ret <2 x double> %x +} + +; CHECK-LABEL: test_mask_broadcast_vaddpd_128 +; CHECK: vaddpd (%rdi){1to2}, %xmm{{.*{%k[1-7]}.*}} +; CHECK: ret +define <2 x double> @test_mask_broadcast_vaddpd_128(<2 x double> %dst, <2 x double> %i, + double* %j, <2 x i64> %mask1) + nounwind { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %tmp = load double* %j + %j.0 = insertelement <2 x double> undef, double %tmp, i64 0 + %j.1 = insertelement <2 x double> %j.0, double %tmp, i64 1 + %x = fadd <2 x double> %j.1, %i + %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> %i + ret <2 x double> %r +} + +; CHECK-LABEL: test_maskz_broadcast_vaddpd_128 +; CHECK: vaddpd (%rdi){1to2}, %xmm{{.*{%k[1-7]} {z}.*}} +; CHECK: ret +define <2 x double> @test_maskz_broadcast_vaddpd_128(<2 x double> %i, double* %j, + <2 x i64> %mask1) nounwind { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %tmp = load double* %j + %j.0 = insertelement <2 x double> undef, double %tmp, i64 0 + %j.1 = insertelement <2 x double> %j.0, double %tmp, i64 1 + %x = fadd <2 x double> %j.1, %i + %r = select <2 x i1> %mask, <2 x double> %x, <2 x double> zeroinitializer + ret <2 x double> %r +} diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll new file mode 100644 index 000000000000..d349f4f53786 --- /dev/null +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -0,0 +1,864 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s + +; 256-bit + +define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: test_pcmpeq_d_256 +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_d_256 +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8) + +define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: test_pcmpeq_q_256 +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_q_256 +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8) + +define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: test_pcmpgt_d_256 +; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 ## + %res = call i8 
@llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_d_256 +; CHECK: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32>, <8 x i32>, i8) + +define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: test_pcmpgt_q_256 +; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_q_256 +; CHECK: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8) + +define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: test_cmp_d_256 +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltd %ymm1, %ymm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpled %ymm1, %ymm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnled %ymm1, %ymm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordd %ymm1, %ymm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_cmp_d_256 +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltd %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpled %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x 
i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnled %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordd %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone + +define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { +; CHECK-LABEL: test_ucmp_d_256 +; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltud %ymm1, %ymm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleud %ymm1, %ymm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordud %ymm1, %ymm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_ucmp_d_256 +; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltud %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleud %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: 
vpcmpnequd %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordud %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone + +define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { +; CHECK-LABEL: test_cmp_q_256 +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltq %ymm1, %ymm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleq %ymm1, %ymm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordq %ymm1, %ymm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_cmp_q_256 +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltq %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i8 
@llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordq %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone + +define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { +; CHECK-LABEL: test_ucmp_q_256 +; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmporduq %ymm1, %ymm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_ucmp_q_256 +; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x 
i64> %a0, <4 x i64> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmporduq %ymm1, %ymm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone + +; 128-bit + +define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_pcmpeq_d_128 +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_d_128 +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8) + +define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_pcmpeq_q_128 +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpeq_q_128 +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8) + +define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_pcmpgt_d_128 +; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_d_128 +; CHECK: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32>, <4 x i32>, i8) + +define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_pcmpgt_q_128 +; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_pcmpgt_q_128 +; CHECK: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ## + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8) + +define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: test_cmp_d_128 +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltd %xmm1, %xmm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 
x i32> %a0, <4 x i32> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpled %xmm1, %xmm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnled %xmm1, %xmm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordd %xmm1, %xmm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_cmp_d_128 +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltd %xmm1, %xmm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpled %xmm1, %xmm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnled %xmm1, %xmm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordd %xmm1, %xmm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone + +define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: test_ucmp_d_128 +; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltud %xmm1, %xmm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1) + %vec1 = 
insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleud %xmm1, %xmm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordud %xmm1, %xmm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_ucmp_d_128 +; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltud %xmm1, %xmm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleud %xmm1, %xmm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordud %xmm1, %xmm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone + +define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { +; CHECK-LABEL: test_cmp_q_128 +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltq %xmm1, %xmm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 
%res1, i32 1 +; CHECK: vpcmpleq %xmm1, %xmm0, %k0 ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordq %xmm1, %xmm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_cmp_q_128 +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltq %xmm1, %xmm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleq %xmm1, %xmm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmpordq %xmm1, %xmm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone + +define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { +; CHECK-LABEL: test_ucmp_q_128 +; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 ## + %res2 = 
call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmporduq %xmm1, %xmm0, %k0 ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { +; CHECK-LABEL: test_mask_ucmp_q_128 +; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ## + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 +; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} ## + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 +; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 {%k1} ## + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 +; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 {%k1} ## + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 +; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 {%k1} ## + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 +; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} ## + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 +; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} ## + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 +; CHECK: vpcmporduq %xmm1, %xmm0, %k0 {%k1} ## + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone + +; CHECK-LABEL: compr1 +; CHECK: vcompresspd %zmm0 +define void @compr1(i8* %addr, <8 x double> %data, i8 %mask) { + call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask) + +; CHECK-LABEL: compr2 +; CHECK: vcompresspd %ymm0 +define void @compr2(i8* %addr, <4 x double> %data, i8 %mask) { + call void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask) + ret void +} + +declare 
void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask) + +; CHECK-LABEL: compr3 +; CHECK: vcompressps %xmm0 +define void @compr3(i8* %addr, <4 x float> %data, i8 %mask) { + call void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask) + +; CHECK-LABEL: compr4 +; CHECK: vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0] +define <8 x double> @compr4(i8* %addr, <8 x double> %data, i8 %mask) { + %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask) + +; CHECK-LABEL: compr5 +; CHECK: vcompresspd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1] +define <4 x double> @compr5(<4 x double> %data, <4 x double> %src0, i8 %mask) { + %res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask) + +; CHECK-LABEL: compr6 +; CHECK: vcompressps %xmm0 +define <4 x float> @compr6(<4 x float> %data, i8 %mask) { + %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask) + +; CHECK-LABEL: compr7 +; CHECK-NOT: vcompress +; CHECK: vmovapd +define void @compr7(i8* %addr, <8 x double> %data) { + call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1) + ret void +} + +; CHECK-LABEL: compr8 +; CHECK-NOT: vcompressps %xmm0 +define <4 x float> @compr8(<4 x float> %data) { + %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1) + ret <4 x float> %res +} + +; CHECK-LABEL: compr9 +; CHECK: vpcompressq %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07] +define void @compr9(i8* %addr, <8 x i64> %data, i8 %mask) { + call void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask) + +; CHECK-LABEL: compr10 +; CHECK: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0] +define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) { + %res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask) + +; Expand + +; CHECK-LABEL: expand1 +; CHECK: vexpandpd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07] +define <8 x double> @expand1(i8* %addr, <8 x double> %data, i8 %mask) { + %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask) + +; CHECK-LABEL: expand2 +; CHECK: vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07] +define <4 x double> @expand2(i8* %addr, <4 x double> 
%data, i8 %mask) { + %res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask) + +; CHECK-LABEL: expand3 +; CHECK: vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07] +define <4 x float> @expand3(i8* %addr, <4 x float> %data, i8 %mask) { + %res = call <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask) + +; CHECK-LABEL: expand4 +; CHECK: vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0] +define <8 x double> @expand4(i8* %addr, <8 x double> %data, i8 %mask) { + %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask) + +; CHECK-LABEL: expand5 +; CHECK: vexpandpd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8] +define <4 x double> @expand5(<4 x double> %data, <4 x double> %src0, i8 %mask) { + %res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask) + +; CHECK-LABEL: expand6 +; CHECK: vexpandps %xmm0 +define <4 x float> @expand6(<4 x float> %data, i8 %mask) { + %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask) + +; CHECK-LABEL: expand7 +; CHECK-NOT: vexpand +; CHECK: vmovapd +define <8 x double> @expand7(i8* %addr, <8 x double> %data) { + %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1) + ret <8 x double> %res +} + +; CHECK-LABEL: expand8 +; CHECK-NOT: vexpandps %xmm0 +define <4 x float> @expand8(<4 x float> %data) { + %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1) + ret <4 x float> %res +} + +; CHECK-LABEL: expand9 +; CHECK: vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07] +define <8 x i64> @expand9(i8* %addr, <8 x i64> %data, i8 %mask) { + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask) + +; CHECK-LABEL: expand10 +; CHECK: vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0] +define <4 x i32> @expand10(<4 x i32> %data, i8 %mask) { + %res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask) + +define <8 x float> @test_x86_mask_blend_ps_256(i8 %a0, <8 x float> %a1, <8 x float> %a2) { + ; CHECK: vblendmps %ymm1, %ymm0 + %res = call <8 x float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float> %a1, <8 x float> %a2, i8 %a0) ; <<8 x float>> [#uses=1] + ret <8 x float> %res +} + +declare <8 x 
float> @llvm.x86.avx512.mask.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readonly + +define <4 x double> @test_x86_mask_blend_pd_256(i8 %a0, <4 x double> %a1, <4 x double> %a2) { + ; CHECK: vblendmpd %ymm1, %ymm0 + %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a1, <4 x double> %a2, i8 %a0) ; <<4 x double>> [#uses=1] + ret <4 x double> %res +} + +define <4 x double> @test_x86_mask_blend_pd_256_memop(<4 x double> %a, <4 x double>* %ptr, i8 %mask) { + ; CHECK-LABEL: test_x86_mask_blend_pd_256_memop + ; CHECK: vblendmpd (% + %b = load <4 x double>* %ptr + %res = call <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double> %a, <4 x double> %b, i8 %mask) ; <<4 x double>> [#uses=1] + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.avx512.mask.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readonly + +; CHECK-LABEL: test_x86_mask_blend_d_256 +; CHECK: vpblendmd +define <8 x i32> @test_x86_mask_blend_d_256(i8 %a0, <8 x i32> %a1, <8 x i32> %a2) { + %res = call <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32> %a1, <8 x i32> %a2, i8 %a0) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx512.mask.blend.d.256(<8 x i32>, <8 x i32>, i8) nounwind readonly + +define <4 x i64> @test_x86_mask_blend_q_256(i8 %a0, <4 x i64> %a1, <4 x i64> %a2) { + ; CHECK: vpblendmq + %res = call <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64> %a1, <4 x i64> %a2, i8 %a0) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx512.mask.blend.q.256(<4 x i64>, <4 x i64>, i8) nounwind readonly + +define <4 x float> @test_x86_mask_blend_ps_128(i8 %a0, <4 x float> %a1, <4 x float> %a2) { + ; CHECK: vblendmps %xmm1, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float> %a1, <4 x float> %a2, i8 %a0) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.blend.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly + +define <2 x double> @test_x86_mask_blend_pd_128(i8 %a0, <2 x double> %a1, <2 x double> %a2) { + ; CHECK: vblendmpd %xmm1, %xmm0 + %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a1, <2 x double> %a2, i8 %a0) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} + +define <2 x double> @test_x86_mask_blend_pd_128_memop(<2 x double> %a, <2 x double>* %ptr, i8 %mask) { + ; CHECK-LABEL: test_x86_mask_blend_pd_128_memop + ; CHECK: vblendmpd (% + %b = load <2 x double>* %ptr + %res = call <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double> %a, <2 x double> %b, i8 %mask) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.avx512.mask.blend.pd.128(<2 x double>, <2 x double>, i8) nounwind readonly + +define <4 x i32> @test_x86_mask_blend_d_128(i8 %a0, <4 x i32> %a1, <4 x i32> %a2) { + ; CHECK: vpblendmd + %res = call <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32> %a1, <4 x i32> %a2, i8 %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx512.mask.blend.d.128(<4 x i32>, <4 x i32>, i8) nounwind readonly + +define <2 x i64> @test_x86_mask_blend_q_128(i8 %a0, <2 x i64> %a1, <2 x i64> %a2) { + ; CHECK: vpblendmq + %res = call <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64> %a1, <2 x i64> %a2, i8 %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.avx512.mask.blend.q.128(<2 x i64>, <2 x i64>, i8) nounwind readonly diff --git a/test/CodeGen/X86/avx512vl-logic.ll b/test/CodeGen/X86/avx512vl-logic.ll new file mode 
100644 index 000000000000..02cb8f978656 --- /dev/null +++ b/test/CodeGen/X86/avx512vl-logic.ll @@ -0,0 +1,137 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl | FileCheck %s + +; 256-bit + +; CHECK-LABEL: vpandd256 +; CHECK: vpandd %ymm +; CHECK: ret +define <8 x i32> @vpandd256(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { +entry: + ; Force the execution domain with an add. + %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %x = and <8 x i32> %a2, %b + ret <8 x i32> %x +} + +; CHECK-LABEL: vpord256 +; CHECK: vpord %ymm +; CHECK: ret +define <8 x i32> @vpord256(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { +entry: + ; Force the execution domain with an add. + %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %x = or <8 x i32> %a2, %b + ret <8 x i32> %x +} + +; CHECK-LABEL: vpxord256 +; CHECK: vpxord %ymm +; CHECK: ret +define <8 x i32> @vpxord256(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp { +entry: + ; Force the execution domain with an add. + %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %x = xor <8 x i32> %a2, %b + ret <8 x i32> %x +} + +; CHECK-LABEL: vpandq256 +; CHECK: vpandq %ymm +; CHECK: ret +define <4 x i64> @vpandq256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +entry: + ; Force the execution domain with an add. + %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1> + %x = and <4 x i64> %a2, %b + ret <4 x i64> %x +} + +; CHECK-LABEL: vporq256 +; CHECK: vporq %ymm +; CHECK: ret +define <4 x i64> @vporq256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +entry: + ; Force the execution domain with an add. + %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1> + %x = or <4 x i64> %a2, %b + ret <4 x i64> %x +} + +; CHECK-LABEL: vpxorq256 +; CHECK: vpxorq %ymm +; CHECK: ret +define <4 x i64> @vpxorq256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp { +entry: + ; Force the execution domain with an add. + %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1> + %x = xor <4 x i64> %a2, %b + ret <4 x i64> %x +} + +; 128-bit + +; CHECK-LABEL: vpandd128 +; CHECK: vpandd %xmm +; CHECK: ret +define <4 x i32> @vpandd128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp { +entry: + ; Force the execution domain with an add. + %a2 = add <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1> + %x = and <4 x i32> %a2, %b + ret <4 x i32> %x +} + +; CHECK-LABEL: vpord128 +; CHECK: vpord %xmm +; CHECK: ret +define <4 x i32> @vpord128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp { +entry: + ; Force the execution domain with an add. + %a2 = add <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1> + %x = or <4 x i32> %a2, %b + ret <4 x i32> %x +} + +; CHECK-LABEL: vpxord128 +; CHECK: vpxord %xmm +; CHECK: ret +define <4 x i32> @vpxord128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp { +entry: + ; Force the execution domain with an add. + %a2 = add <4 x i32> %a, <i32 1, i32 1, i32 1, i32 1> + %x = xor <4 x i32> %a2, %b + ret <4 x i32> %x +} + +; CHECK-LABEL: vpandq128 +; CHECK: vpandq %xmm +; CHECK: ret +define <2 x i64> @vpandq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp { +entry: + ; Force the execution domain with an add. 
+  %a2 = add <2 x i64> %a, <i64 1, i64 1>
+  %x = and <2 x i64> %a2, %b
+  ret <2 x i64> %x
+}
+
+; CHECK-LABEL: vporq128
+; CHECK: vporq %xmm
+; CHECK: ret
+define <2 x i64> @vporq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <2 x i64> %a, <i64 1, i64 1>
+  %x = or <2 x i64> %a2, %b
+  ret <2 x i64> %x
+}
+
+; CHECK-LABEL: vpxorq128
+; CHECK: vpxorq %xmm
+; CHECK: ret
+define <2 x i64> @vpxorq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <2 x i64> %a, <i64 1, i64 1>
+  %x = xor <2 x i64> %a2, %b
+  ret <2 x i64> %x
+}
diff --git a/test/CodeGen/X86/avx512vl-mov.ll b/test/CodeGen/X86/avx512vl-mov.ll
new file mode 100644
index 000000000000..32246568ac2e
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-mov.ll
@@ -0,0 +1,642 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding | FileCheck %s
+
+; CHECK-LABEL: test_256_1
+; CHECK: vmovdqu32
+; CHECK: ret
+define <8 x i32> @test_256_1(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %res = load <8 x i32>* %vaddr, align 1
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_2
+; CHECK: vmovdqa32
+; CHECK: ret
+define <8 x i32> @test_256_2(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  %res = load <8 x i32>* %vaddr, align 32
+  ret <8 x i32>%res
+}
+
+; CHECK-LABEL: test_256_3
+; CHECK: vmovdqa64
+; CHECK: ret
+define void @test_256_3(i8 * %addr, <4 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  store <4 x i64>%data, <4 x i64>* %vaddr, align 32
+  ret void
+}
+
+; CHECK-LABEL: test_256_4
+; CHECK: vmovdqu32
+; CHECK: ret
+define void @test_256_4(i8 * %addr, <8 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  store <8 x i32>%data, <8 x i32>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_5
+; CHECK: vmovdqa32
+; CHECK: ret
+define void @test_256_5(i8 * %addr, <8 x i32> %data) {
+  %vaddr = bitcast i8* %addr to <8 x i32>*
+  store <8 x i32>%data, <8 x i32>* %vaddr, align 32
+  ret void
+}
+
+; CHECK-LABEL: test_256_6
+; CHECK: vmovdqa64
+; CHECK: ret
+define <4 x i64> @test_256_6(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %res = load <4 x i64>* %vaddr, align 32
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_7
+; CHECK: vmovdqu64
+; CHECK: ret
+define void @test_256_7(i8 * %addr, <4 x i64> %data) {
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  store <4 x i64>%data, <4 x i64>* %vaddr, align 1
+  ret void
+}
+
+; CHECK-LABEL: test_256_8
+; CHECK: vmovdqu64
+; CHECK: ret
+define <4 x i64> @test_256_8(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x i64>*
+  %res = load <4 x i64>* %vaddr, align 1
+  ret <4 x i64>%res
+}
+
+; CHECK-LABEL: test_256_9
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_9(i8 * %addr, <4 x double> %data) {
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  store <4 x double>%data, <4 x double>* %vaddr, align 32
+  ret void
+}
+
+; CHECK-LABEL: test_256_10
+; CHECK: vmovapd {{.*}} ## encoding: [0x62
+; CHECK: ret
+define <4 x double> @test_256_10(i8 * %addr) {
+  %vaddr = bitcast i8* %addr to <4 x double>*
+  %res = load <4 x double>* %vaddr, align 32
+  ret <4 x double>%res
+}
+
+; CHECK-LABEL: test_256_11
+; CHECK: vmovaps {{.*}} ## encoding: [0x62
+; CHECK: ret
+define void @test_256_11(i8 * %addr, <8 x float> %data) {
+  %vaddr = bitcast i8* %addr to <8 x float>*
+  store <8 x float>%data, <8 x float>* %vaddr, align 32
+  ret void
+}
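+
+; The masked tests below (test_256_17 onward) build their predicate in IR
+; rather than taking an i8 mask argument: an icmp (or fcmp) against
+; zeroinitializer yields the <N x i1> mask, and a select between the loaded
+; value and %old (or zeroinitializer for the {z} zero-masking forms) is the
+; pattern llc is expected to fold into a single masked move. A sketch of the
+; shape (the name is illustrative; test_256_17 below is a concrete instance):
+;
+;   define <8 x i32> @masked_load_sketch(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
+;     %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+;     %vaddr = bitcast i8* %addr to <8 x i32>*
+;     %r = load <8 x i32>* %vaddr, align 32
+;     %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old
+;     ret <8 x i32> %res
+;   }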
+ +; CHECK-LABEL: test_256_12 +; CHECK: vmovaps {{.*}} ## encoding: [0x62 +; CHECK: ret +define <8 x float> @test_256_12(i8 * %addr) { + %vaddr = bitcast i8* %addr to <8 x float>* + %res = load <8 x float>* %vaddr, align 32 + ret <8 x float>%res +} + +; CHECK-LABEL: test_256_13 +; CHECK: vmovupd {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_256_13(i8 * %addr, <4 x double> %data) { + %vaddr = bitcast i8* %addr to <4 x double>* + store <4 x double>%data, <4 x double>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_256_14 +; CHECK: vmovupd {{.*}} ## encoding: [0x62 +; CHECK: ret +define <4 x double> @test_256_14(i8 * %addr) { + %vaddr = bitcast i8* %addr to <4 x double>* + %res = load <4 x double>* %vaddr, align 1 + ret <4 x double>%res +} + +; CHECK-LABEL: test_256_15 +; CHECK: vmovups {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_256_15(i8 * %addr, <8 x float> %data) { + %vaddr = bitcast i8* %addr to <8 x float>* + store <8 x float>%data, <8 x float>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_256_16 +; CHECK: vmovups {{.*}} ## encoding: [0x62 +; CHECK: ret +define <8 x float> @test_256_16(i8 * %addr) { + %vaddr = bitcast i8* %addr to <8 x float>* + %res = load <8 x float>* %vaddr, align 1 + ret <8 x float>%res +} + +; CHECK-LABEL: test_256_17 +; CHECK: vmovdqa32{{.*{%k[1-7]} }} +; CHECK: ret +define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i32>* + %r = load <8 x i32>* %vaddr, align 32 + %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old + ret <8 x i32>%res +} + +; CHECK-LABEL: test_256_18 +; CHECK: vmovdqu32{{.*{%k[1-7]} }} +; CHECK: ret +define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i32>* + %r = load <8 x i32>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> %old + ret <8 x i32>%res +} + +; CHECK-LABEL: test_256_19 +; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i32>* + %r = load <8 x i32>* %vaddr, align 32 + %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer + ret <8 x i32>%res +} + +; CHECK-LABEL: test_256_20 +; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) { + %mask = icmp ne <8 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x i32>* + %r = load <8 x i32>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x i32> %r, <8 x i32> zeroinitializer + ret <8 x i32>%res +} + +; CHECK-LABEL: test_256_21 +; CHECK: vmovdqa64{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i64>* + %r = load <4 x i64>* %vaddr, align 32 + %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old + ret <4 x i64>%res +} + +; CHECK-LABEL: test_256_22 +; CHECK: vmovdqu64{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i64>* + %r = load <4 x i64>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> %old + ret <4 x i64>%res +} + +; 
CHECK-LABEL: test_256_23 +; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i64>* + %r = load <4 x i64>* %vaddr, align 32 + %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer + ret <4 x i64>%res +} + +; CHECK-LABEL: test_256_24 +; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i64>* + %r = load <4 x i64>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x i64> %r, <4 x i64> zeroinitializer + ret <4 x i64>%res +} + +; CHECK-LABEL: test_256_25 +; CHECK: vmovaps{{.*{%k[1-7]} }} +; CHECK: ret +define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1) { + %mask = fcmp one <8 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x float>* + %r = load <8 x float>* %vaddr, align 32 + %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> %old + ret <8 x float>%res +} + +; CHECK-LABEL: test_256_26 +; CHECK: vmovups{{.*{%k[1-7]} }} +; CHECK: ret +define <8 x float> @test_256_26(i8 * %addr, <8 x float> %old, <8 x float> %mask1) { + %mask = fcmp one <8 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x float>* + %r = load <8 x float>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> %old + ret <8 x float>%res +} + +; CHECK-LABEL: test_256_27 +; CHECK: vmovaps{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <8 x float> @test_256_27(i8 * %addr, <8 x float> %mask1) { + %mask = fcmp one <8 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x float>* + %r = load <8 x float>* %vaddr, align 32 + %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> zeroinitializer + ret <8 x float>%res +} + +; CHECK-LABEL: test_256_28 +; CHECK: vmovups{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <8 x float> @test_256_28(i8 * %addr, <8 x float> %mask1) { + %mask = fcmp one <8 x float> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <8 x float>* + %r = load <8 x float>* %vaddr, align 1 + %res = select <8 x i1> %mask, <8 x float> %r, <8 x float> zeroinitializer + ret <8 x float>%res +} + +; CHECK-LABEL: test_256_29 +; CHECK: vmovapd{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x double>* + %r = load <4 x double>* %vaddr, align 32 + %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> %old + ret <4 x double>%res +} + +; CHECK-LABEL: test_256_30 +; CHECK: vmovupd{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x double>* + %r = load <4 x double>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> %old + ret <4 x double>%res +} + +; CHECK-LABEL: test_256_31 +; CHECK: vmovapd{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x double>* + %r = load <4 x double>* %vaddr, align 32 + %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> zeroinitializer + ret <4 x double>%res +} + +; CHECK-LABEL: test_256_32 +; CHECK: 
vmovupd{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) { + %mask = icmp ne <4 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x double>* + %r = load <4 x double>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x double> %r, <4 x double> zeroinitializer + ret <4 x double>%res +} + +; CHECK-LABEL: test_128_1 +; CHECK: vmovdqu32 +; CHECK: ret +define <4 x i32> @test_128_1(i8 * %addr) { + %vaddr = bitcast i8* %addr to <4 x i32>* + %res = load <4 x i32>* %vaddr, align 1 + ret <4 x i32>%res +} + +; CHECK-LABEL: test_128_2 +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test_128_2(i8 * %addr) { + %vaddr = bitcast i8* %addr to <4 x i32>* + %res = load <4 x i32>* %vaddr, align 16 + ret <4 x i32>%res +} + +; CHECK-LABEL: test_128_3 +; CHECK: vmovdqa64 +; CHECK: ret +define void @test_128_3(i8 * %addr, <2 x i64> %data) { + %vaddr = bitcast i8* %addr to <2 x i64>* + store <2 x i64>%data, <2 x i64>* %vaddr, align 16 + ret void +} + +; CHECK-LABEL: test_128_4 +; CHECK: vmovdqu32 +; CHECK: ret +define void @test_128_4(i8 * %addr, <4 x i32> %data) { + %vaddr = bitcast i8* %addr to <4 x i32>* + store <4 x i32>%data, <4 x i32>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_128_5 +; CHECK: vmovdqa32 +; CHECK: ret +define void @test_128_5(i8 * %addr, <4 x i32> %data) { + %vaddr = bitcast i8* %addr to <4 x i32>* + store <4 x i32>%data, <4 x i32>* %vaddr, align 16 + ret void +} + +; CHECK-LABEL: test_128_6 +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test_128_6(i8 * %addr) { + %vaddr = bitcast i8* %addr to <2 x i64>* + %res = load <2 x i64>* %vaddr, align 16 + ret <2 x i64>%res +} + +; CHECK-LABEL: test_128_7 +; CHECK: vmovdqu64 +; CHECK: ret +define void @test_128_7(i8 * %addr, <2 x i64> %data) { + %vaddr = bitcast i8* %addr to <2 x i64>* + store <2 x i64>%data, <2 x i64>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_128_8 +; CHECK: vmovdqu64 +; CHECK: ret +define <2 x i64> @test_128_8(i8 * %addr) { + %vaddr = bitcast i8* %addr to <2 x i64>* + %res = load <2 x i64>* %vaddr, align 1 + ret <2 x i64>%res +} + +; CHECK-LABEL: test_128_9 +; CHECK: vmovapd {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_128_9(i8 * %addr, <2 x double> %data) { + %vaddr = bitcast i8* %addr to <2 x double>* + store <2 x double>%data, <2 x double>* %vaddr, align 16 + ret void +} + +; CHECK-LABEL: test_128_10 +; CHECK: vmovapd {{.*}} ## encoding: [0x62 +; CHECK: ret +define <2 x double> @test_128_10(i8 * %addr) { + %vaddr = bitcast i8* %addr to <2 x double>* + %res = load <2 x double>* %vaddr, align 16 + ret <2 x double>%res +} + +; CHECK-LABEL: test_128_11 +; CHECK: vmovaps {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_128_11(i8 * %addr, <4 x float> %data) { + %vaddr = bitcast i8* %addr to <4 x float>* + store <4 x float>%data, <4 x float>* %vaddr, align 16 + ret void +} + +; CHECK-LABEL: test_128_12 +; CHECK: vmovaps {{.*}} ## encoding: [0x62 +; CHECK: ret +define <4 x float> @test_128_12(i8 * %addr) { + %vaddr = bitcast i8* %addr to <4 x float>* + %res = load <4 x float>* %vaddr, align 16 + ret <4 x float>%res +} + +; CHECK-LABEL: test_128_13 +; CHECK: vmovupd {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_128_13(i8 * %addr, <2 x double> %data) { + %vaddr = bitcast i8* %addr to <2 x double>* + store <2 x double>%data, <2 x double>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_128_14 +; CHECK: vmovupd {{.*}} ## encoding: [0x62 +; CHECK: ret +define <2 x double> @test_128_14(i8 * %addr) { + 
%vaddr = bitcast i8* %addr to <2 x double>* + %res = load <2 x double>* %vaddr, align 1 + ret <2 x double>%res +} + +; CHECK-LABEL: test_128_15 +; CHECK: vmovups {{.*}} ## encoding: [0x62 +; CHECK: ret +define void @test_128_15(i8 * %addr, <4 x float> %data) { + %vaddr = bitcast i8* %addr to <4 x float>* + store <4 x float>%data, <4 x float>* %vaddr, align 1 + ret void +} + +; CHECK-LABEL: test_128_16 +; CHECK: vmovups {{.*}} ## encoding: [0x62 +; CHECK: ret +define <4 x float> @test_128_16(i8 * %addr) { + %vaddr = bitcast i8* %addr to <4 x float>* + %res = load <4 x float>* %vaddr, align 1 + ret <4 x float>%res +} + +; CHECK-LABEL: test_128_17 +; CHECK: vmovdqa32{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i32>* + %r = load <4 x i32>* %vaddr, align 16 + %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old + ret <4 x i32>%res +} + +; CHECK-LABEL: test_128_18 +; CHECK: vmovdqu32{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i32>* + %r = load <4 x i32>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> %old + ret <4 x i32>%res +} + +; CHECK-LABEL: test_128_19 +; CHECK: vmovdqa32{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i32>* + %r = load <4 x i32>* %vaddr, align 16 + %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer + ret <4 x i32>%res +} + +; CHECK-LABEL: test_128_20 +; CHECK: vmovdqu32{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x i32>* + %r = load <4 x i32>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x i32> %r, <4 x i32> zeroinitializer + ret <4 x i32>%res +} + +; CHECK-LABEL: test_128_21 +; CHECK: vmovdqa64{{.*{%k[1-7]} }} +; CHECK: ret +define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x i64>* + %r = load <2 x i64>* %vaddr, align 16 + %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old + ret <2 x i64>%res +} + +; CHECK-LABEL: test_128_22 +; CHECK: vmovdqu64{{.*{%k[1-7]} }} +; CHECK: ret +define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x i64>* + %r = load <2 x i64>* %vaddr, align 1 + %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> %old + ret <2 x i64>%res +} + +; CHECK-LABEL: test_128_23 +; CHECK: vmovdqa64{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x i64>* + %r = load <2 x i64>* %vaddr, align 16 + %res = select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer + ret <2 x i64>%res +} + +; CHECK-LABEL: test_128_24 +; CHECK: vmovdqu64{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x i64>* + %r = load <2 x i64>* %vaddr, align 1 + %res = 
select <2 x i1> %mask, <2 x i64> %r, <2 x i64> zeroinitializer + ret <2 x i64>%res +} + +; CHECK-LABEL: test_128_25 +; CHECK: vmovaps{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x float>* + %r = load <4 x float>* %vaddr, align 16 + %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> %old + ret <4 x float>%res +} + +; CHECK-LABEL: test_128_26 +; CHECK: vmovups{{.*{%k[1-7]} }} +; CHECK: ret +define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x float>* + %r = load <4 x float>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> %old + ret <4 x float>%res +} + +; CHECK-LABEL: test_128_27 +; CHECK: vmovaps{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x float>* + %r = load <4 x float>* %vaddr, align 16 + %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> zeroinitializer + ret <4 x float>%res +} + +; CHECK-LABEL: test_128_28 +; CHECK: vmovups{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) { + %mask = icmp ne <4 x i32> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <4 x float>* + %r = load <4 x float>* %vaddr, align 1 + %res = select <4 x i1> %mask, <4 x float> %r, <4 x float> zeroinitializer + ret <4 x float>%res +} + +; CHECK-LABEL: test_128_29 +; CHECK: vmovapd{{.*{%k[1-7]} }} +; CHECK: ret +define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x double>* + %r = load <2 x double>* %vaddr, align 16 + %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> %old + ret <2 x double>%res +} + +; CHECK-LABEL: test_128_30 +; CHECK: vmovupd{{.*{%k[1-7]} }} +; CHECK: ret +define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x double>* + %r = load <2 x double>* %vaddr, align 1 + %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> %old + ret <2 x double>%res +} + +; CHECK-LABEL: test_128_31 +; CHECK: vmovapd{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x double>* + %r = load <2 x double>* %vaddr, align 16 + %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> zeroinitializer + ret <2 x double>%res +} + +; CHECK-LABEL: test_128_32 +; CHECK: vmovupd{{.*{%k[1-7]} {z} }} +; CHECK: ret +define <2 x double> @test_128_32(i8 * %addr, <2 x i64> %mask1) { + %mask = icmp ne <2 x i64> %mask1, zeroinitializer + %vaddr = bitcast i8* %addr to <2 x double>* + %r = load <2 x double>* %vaddr, align 1 + %res = select <2 x i1> %mask, <2 x double> %r, <2 x double> zeroinitializer + ret <2 x double>%res +} + diff --git a/test/CodeGen/X86/avx512vl-nontemporal.ll b/test/CodeGen/X86/avx512vl-nontemporal.ll new file mode 100644 index 000000000000..fdafb35807e4 --- /dev/null +++ b/test/CodeGen/X86/avx512vl-nontemporal.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s + 
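+; The stores below are emitted as non-temporal moves because each one carries
+; !nontemporal metadata pointing at !0 = !{i32 1} (declared at the end of this
+; file); the fadd/add just produces a value to store. A sketch of the pattern
+; both functions repeat (the name is illustrative):
+;
+;   define void @nt_store_sketch(<8 x float> %A, <8 x float> %AA, i8* %B) {
+;     %cast = bitcast i8* %B to <8 x float>*
+;     %A2 = fadd <8 x float> %A, %AA
+;     store <8 x float> %A2, <8 x float>* %cast, align 64, !nontemporal !0
+;     ret void
+;   }
+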
+define void @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE) { +; CHECK: vmovntps %ymm{{.*}} ## encoding: [0x62 + %cast = bitcast i8* %B to <8 x float>* + %A2 = fadd <8 x float> %A, %AA + store <8 x float> %A2, <8 x float>* %cast, align 64, !nontemporal !0 +; CHECK: vmovntdq %ymm{{.*}} ## encoding: [0x62 + %cast1 = bitcast i8* %B to <4 x i64>* + %E2 = add <4 x i64> %E, %EE + store <4 x i64> %E2, <4 x i64>* %cast1, align 64, !nontemporal !0 +; CHECK: vmovntpd %ymm{{.*}} ## encoding: [0x62 + %cast2 = bitcast i8* %B to <4 x double>* + %C2 = fadd <4 x double> %C, %CC + store <4 x double> %C2, <4 x double>* %cast2, align 64, !nontemporal !0 + ret void +} + +define void @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE) { +; CHECK: vmovntps %xmm{{.*}} ## encoding: [0x62 + %cast = bitcast i8* %B to <4 x float>* + %A2 = fadd <4 x float> %A, %AA + store <4 x float> %A2, <4 x float>* %cast, align 64, !nontemporal !0 +; CHECK: vmovntdq %xmm{{.*}} ## encoding: [0x62 + %cast1 = bitcast i8* %B to <2 x i64>* + %E2 = add <2 x i64> %E, %EE + store <2 x i64> %E2, <2 x i64>* %cast1, align 64, !nontemporal !0 +; CHECK: vmovntpd %xmm{{.*}} ## encoding: [0x62 + %cast2 = bitcast i8* %B to <2 x double>* + %C2 = fadd <2 x double> %C, %CC + store <2 x double> %C2, <2 x double>* %cast2, align 64, !nontemporal !0 + ret void +} +!0 = !{i32 1} diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll new file mode 100644 index 000000000000..b6b508559ca3 --- /dev/null +++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll @@ -0,0 +1,381 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s + +; CHECK-LABEL: test256_1 +; CHECK: vpcmpeqq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind { + %mask = icmp eq <4 x i64> %x, %y + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y + ret <4 x i64> %max +} + +; CHECK-LABEL: test256_2 +; CHECK: vpcmpgtq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind { + %mask = icmp sgt <4 x i64> %x, %y + %max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y + ret <4 x i64> %max +} + +; CHECK-LABEL: @test256_3 +; CHECK: vpcmpled {{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind { + %mask = icmp sge <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y + ret <8 x i32> %max +} + +; CHECK-LABEL: test256_4 +; CHECK: vpcmpnleuq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind { + %mask = icmp ugt <4 x i64> %x, %y + %max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y + ret <4 x i64> %max +} + +; CHECK-LABEL: test256_5 +; CHECK: vpcmpeqd (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind { + %y = load <8 x i32>* %yp, align 4 + %mask = icmp eq <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_6 +; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { + %y = load <8 x i32>* %y.ptr, align 4 + %mask = 
icmp sgt <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_7 +; CHECK: vpcmpled (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { + %y = load <8 x i32>* %y.ptr, align 4 + %mask = icmp sle <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_8 +; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind { + %y = load <8 x i32>* %y.ptr, align 4 + %mask = icmp ule <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_9 +; CHECK: vpcmpeqd %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind { + %mask1 = icmp eq <8 x i32> %x1, %y1 + %mask0 = icmp eq <8 x i32> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y + ret <8 x i32> %max +} + +; CHECK-LABEL: @test256_10 +; CHECK: vpcmpleq %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) nounwind { + %mask1 = icmp sge <4 x i64> %x1, %y1 + %mask0 = icmp sle <4 x i64> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1 + ret <4 x i64> %max +} + +; CHECK-LABEL: @test256_11 +; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind { + %mask1 = icmp sgt <4 x i64> %x1, %y1 + %y = load <4 x i64>* %y.ptr, align 4 + %mask0 = icmp sgt <4 x i64> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1 + ret <4 x i64> %max +} + +; CHECK-LABEL: @test256_12 +; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind { + %mask1 = icmp sge <8 x i32> %x1, %y1 + %y = load <8 x i32>* %y.ptr, align 4 + %mask0 = icmp ule <8 x i32> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: test256_13 +; CHECK: vpcmpeqq (%rdi){1to4}, %ymm +; CHECK: vmovdqa64 +; CHECK: ret +define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind { + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0 + %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer + %mask = icmp eq <4 x i64> %x, %y + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1 + ret <4 x i64> %max +} + +; CHECK-LABEL: test256_14 +; CHECK: vpcmpled (%rdi){1to8}, %ymm +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind { + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer + %mask = icmp sle <8 x i32> %x, %y + %max = select <8 x i1> %mask, <8 x 
i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: test256_15 +; CHECK: vpcmpgtd (%rdi){1to8}, %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind { + %mask1 = icmp sge <8 x i32> %x1, %y1 + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <8 x i32> %y.0, <8 x i32> undef, <8 x i32> zeroinitializer + %mask0 = icmp sgt <8 x i32> %x, %y + %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer + %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %x1 + ret <8 x i32> %max +} + +; CHECK-LABEL: test256_16 +; CHECK: vpcmpgtq (%rdi){1to4}, %ymm{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind { + %mask1 = icmp sge <4 x i64> %x1, %y1 + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0 + %y = shufflevector <4 x i64> %y.0, <4 x i64> undef, <4 x i32> zeroinitializer + %mask0 = icmp sgt <4 x i64> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %x1 + ret <4 x i64> %max +} + +; CHECK-LABEL: test128_1 +; CHECK: vpcmpeqq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind { + %mask = icmp eq <2 x i64> %x, %y + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y + ret <2 x i64> %max +} + +; CHECK-LABEL: test128_2 +; CHECK: vpcmpgtq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind { + %mask = icmp sgt <2 x i64> %x, %y + %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y + ret <2 x i64> %max +} + +; CHECK-LABEL: @test128_3 +; CHECK: vpcmpled {{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind { + %mask = icmp sge <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y + ret <4 x i32> %max +} + +; CHECK-LABEL: test128_4 +; CHECK: vpcmpnleuq {{.*%k[0-7]}} +; CHECK: vmovdqa64 {{.*}}%k1 +; CHECK: ret +define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind { + %mask = icmp ugt <2 x i64> %x, %y + %max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y + ret <2 x i64> %max +} + +; CHECK-LABEL: test128_5 +; CHECK: vpcmpeqd (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind { + %y = load <4 x i32>* %yp, align 4 + %mask = icmp eq <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_6 +; CHECK: vpcmpgtd (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { + %y = load <4 x i32>* %y.ptr, align 4 + %mask = icmp sgt <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_7 +; CHECK: vpcmpled (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { + %y = load <4 x i32>* %y.ptr, align 4 + %mask = icmp sle <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: 
@test128_8 +; CHECK: vpcmpleud (%rdi){{.*%k[0-7]}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind { + %y = load <4 x i32>* %y.ptr, align 4 + %mask = icmp ule <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_9 +; CHECK: vpcmpeqd %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> %y1) nounwind { + %mask1 = icmp eq <4 x i32> %x1, %y1 + %mask0 = icmp eq <4 x i32> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y + ret <4 x i32> %max +} + +; CHECK-LABEL: @test128_10 +; CHECK: vpcmpleq %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) nounwind { + %mask1 = icmp sge <2 x i64> %x1, %y1 + %mask0 = icmp sle <2 x i64> %x, %y + %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1 + ret <2 x i64> %max +} + +; CHECK-LABEL: @test128_11 +; CHECK: vpcmpgtq (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind { + %mask1 = icmp sgt <2 x i64> %x1, %y1 + %y = load <2 x i64>* %y.ptr, align 4 + %mask0 = icmp sgt <2 x i64> %x, %y + %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1 + ret <2 x i64> %max +} + +; CHECK-LABEL: @test128_12 +; CHECK: vpcmpleud (%rdi){{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind { + %mask1 = icmp sge <4 x i32> %x1, %y1 + %y = load <4 x i32>* %y.ptr, align 4 + %mask0 = icmp ule <4 x i32> %x, %y + %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: test128_13 +; CHECK: vpcmpeqq (%rdi){1to2}, %xmm +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind { + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0 + %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1 + %mask = icmp eq <2 x i64> %x, %y + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1 + ret <2 x i64> %max +} + +; CHECK-LABEL: test128_14 +; CHECK: vpcmpled (%rdi){1to4}, %xmm +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind { + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer + %mask = icmp sle <4 x i32> %x, %y + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: test128_15 +; CHECK: vpcmpgtd (%rdi){1to4}, %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa32 +; CHECK: ret +define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind { + %mask1 = icmp sge <4 x i32> %x1, %y1 + %yb = load i32* %yb.ptr, align 4 + %y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0 + %y = shufflevector <4 x i32> %y.0, <4 x i32> undef, <4 x i32> zeroinitializer + %mask0 = icmp sgt <4 x i32> %x, %y 
+ %mask = select <4 x i1> %mask0, <4 x i1> %mask1, <4 x i1> zeroinitializer + %max = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %x1 + ret <4 x i32> %max +} + +; CHECK-LABEL: test128_16 +; CHECK: vpcmpgtq (%rdi){1to2}, %xmm{{.*{%k[1-7]}}} +; CHECK: vmovdqa64 +; CHECK: ret +define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind { + %mask1 = icmp sge <2 x i64> %x1, %y1 + %yb = load i64* %yb.ptr, align 4 + %y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0 + %y = insertelement <2 x i64> %y.0, i64 %yb, i32 1 + %mask0 = icmp sgt <2 x i64> %x, %y + %mask = select <2 x i1> %mask0, <2 x i1> %mask1, <2 x i1> zeroinitializer + %max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %x1 + ret <2 x i64> %max +} diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll deleted file mode 100644 index 34aaf2c31ace..000000000000 --- a/test/CodeGen/X86/blend-msb.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s - - -; Verify that we produce movss instead of blendvps when possible. - -;CHECK-LABEL: vsel_float: -;CHECK-NOT: blend -;CHECK: movss -;CHECK: ret -define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2 - ret <4 x float> %vsel -} - -;CHECK-LABEL: vsel_4xi8: -;CHECK-NOT: blend -;CHECK: movss -;CHECK: ret -define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2 - ret <4 x i8> %vsel -} - -;CHECK-LABEL: vsel_8xi16: -; The select mask is -; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false> -; which translates into the boolean mask (big endian representation): -; 00010001 = 17. -; '1' means takes the first argument, '0' means takes the second argument. -; This is the opposite of the intel syntax, thus we expect -; the inverted mask: 11101110 = 238. -; According to the ABI: -; v1 is in xmm0 => first argument is xmm0. -; v2 is in xmm1 => second argument is xmm1. -;CHECK: pblendw $238, %xmm1, %xmm0 -;CHECK: ret -define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) { - %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2 - ret <8 x i16> %vsel -} diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll index 2681c109ef5d..e35be6ae654f 100644 --- a/test/CodeGen/X86/block-placement.ll +++ b/test/CodeGen/X86/block-placement.ll @@ -124,7 +124,7 @@ exit: ret i32 %sum } -!0 = metadata !{metadata !"branch_weights", i32 4, i32 64} +!0 = !{!"branch_weights", i32 4, i32 64} define i32 @test_loop_early_exits(i32 %i, i32* %a) { ; Check that we sink early exit blocks out of loop bodies. @@ -237,44 +237,6 @@ exit: ret i32 %base } -define void @test_loop_rotate_reversed_blocks() { -; This test case (greatly reduced from an Olden bencmark) ensures that the loop -; rotate implementation doesn't assume that loops are laid out in a particular -; order. The first loop will get split into two basic blocks, with the loop -; header coming after the loop latch. -; -; CHECK: test_loop_rotate_reversed_blocks -; CHECK: %entry -; Look for a jump into the middle of the loop, and no branches mid-way. 
-; CHECK: jmp -; CHECK: %loop1 -; CHECK-NOT: j{{\w*}} .LBB{{.*}} -; CHECK: %loop1 -; CHECK: je - -entry: - %cond1 = load volatile i1* undef - br i1 %cond1, label %loop2.preheader, label %loop1 - -loop1: - call i32 @f() - %cond2 = load volatile i1* undef - br i1 %cond2, label %loop2.preheader, label %loop1 - -loop2.preheader: - call i32 @f() - %cond3 = load volatile i1* undef - br i1 %cond3, label %exit, label %loop2 - -loop2: - call i32 @f() - %cond4 = load volatile i1* undef - br i1 %cond4, label %exit, label %loop2 - -exit: - ret void -} - define i32 @test_loop_align(i32 %i, i32* %a) { ; Check that we provide basic loop body alignment with the block placement ; pass. @@ -544,7 +506,7 @@ if.end: ret void } -!1 = metadata !{metadata !"branch_weights", i32 1000, i32 1} +!1 = !{!"branch_weights", i32 1000, i32 1} declare i32 @f() declare i32 @g() @@ -580,7 +542,7 @@ exit: ret i32 %result } -!2 = metadata !{metadata !"branch_weights", i32 3, i32 1} +!2 = !{!"branch_weights", i32 3, i32 1} declare i32 @__gxx_personality_v0(...) diff --git a/test/CodeGen/X86/break-avx-dep.ll b/test/CodeGen/X86/break-avx-dep.ll deleted file mode 100644 index 210bda136b57..000000000000 --- a/test/CodeGen/X86/break-avx-dep.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s -; -; rdar:15221834 False AVX register dependencies cause 5x slowdown on -; flops-6. Make sure the unused register read by vcvtsi2sdq is zeroed -; to avoid cyclic dependence on a write to the same register in a -; previous iteration. - -; CHECK-LABEL: t1: -; CHECK-LABEL: %loop -; CHECK: vxorps %[[REG:xmm.]], %{{xmm.}}, %{{xmm.}} -; CHECK: vcvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]], %{{xmm.}} -define i64 @t1(i64* nocapture %x, double* nocapture %y) nounwind { -entry: - %vx = load i64* %x - br label %loop -loop: - %i = phi i64 [ 1, %entry ], [ %inc, %loop ] - %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ] - %fi = sitofp i64 %i to double - %vy = load double* %y - %fipy = fadd double %fi, %vy - %iipy = fptosi double %fipy to i64 - %s2 = add i64 %s1, %iipy - %inc = add nsw i64 %i, 1 - %exitcond = icmp eq i64 %inc, 156250000 - br i1 %exitcond, label %ret, label %loop -ret: - ret i64 %s2 -} diff --git a/test/CodeGen/X86/break-false-dep.ll b/test/CodeGen/X86/break-false-dep.ll new file mode 100644 index 000000000000..7034fae5e8bd --- /dev/null +++ b/test/CodeGen/X86/break-false-dep.ll @@ -0,0 +1,201 @@ +; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s --check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx -mcpu=corei7-avx | FileCheck %s --check-prefix=AVX + +define double @t1(float* nocapture %x) nounwind readonly ssp { +entry: +; SSE-LABEL: t1: +; SSE: movss ([[A0:%rdi|%rcx]]), %xmm0 +; SSE: cvtss2sd %xmm0, %xmm0 + + %0 = load float* %x, align 4 + %1 = fpext float %0 to double + ret double %1 +} + +define float @t2(double* nocapture %x) nounwind readonly ssp optsize { +entry: +; SSE-LABEL: t2: +; SSE: cvtsd2ss ([[A0]]), %xmm0 + %0 = load double* %x, align 8 + %1 = fptrunc double %0 to float + ret float %1 +} + +define float @squirtf(float* %x) nounwind { +entry: +; SSE-LABEL: squirtf: +; SSE: movss ([[A0]]), %xmm0 +; SSE: sqrtss %xmm0, %xmm0 + %z = load float* %x + %t = call float @llvm.sqrt.f32(float %z) + ret float %t +} + +define double @squirt(double* %x) nounwind { +entry: +; SSE-LABEL: squirt: +; SSE: movsd ([[A0]]), %xmm0 +; SSE: sqrtsd %xmm0, %xmm0 + %z = load double* 
%x + %t = call double @llvm.sqrt.f64(double %z) + ret double %t +} + +define float @squirtf_size(float* %x) nounwind optsize { +entry: +; SSE-LABEL: squirtf_size: +; SSE: sqrtss ([[A0]]), %xmm0 + %z = load float* %x + %t = call float @llvm.sqrt.f32(float %z) + ret float %t +} + +define double @squirt_size(double* %x) nounwind optsize { +entry: +; SSE-LABEL: squirt_size: +; SSE: sqrtsd ([[A0]]), %xmm0 + %z = load double* %x + %t = call double @llvm.sqrt.f64(double %z) + ret double %t +} + +declare float @llvm.sqrt.f32(float) +declare double @llvm.sqrt.f64(double) + +; SSE-LABEL: loopdep1 +; SSE: for.body +; +; This loop contains two cvtsi2ss instructions that update the same xmm +; register. Verify that the execution dependency fix pass breaks those +; dependencies by inserting xorps instructions. +; +; If the register allocator chooses different registers for the two cvtsi2ss +; instructions, they are still dependent on themselves. +; SSE: xorps [[XMM1:%xmm[0-9]+]] +; SSE: , [[XMM1]] +; SSE: cvtsi2ssl %{{.*}}, [[XMM1]] +; SSE: xorps [[XMM2:%xmm[0-9]+]] +; SSE: , [[XMM2]] +; SSE: cvtsi2ssl %{{.*}}, [[XMM2]] +; +define float @loopdep1(i32 %m) nounwind uwtable readnone ssp { +entry: + %tobool3 = icmp eq i32 %m, 0 + br i1 %tobool3, label %for.end, label %for.body + +for.body: ; preds = %entry, %for.body + %m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ] + %s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ] + %s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ] + %n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ] + %conv = sitofp i32 %n.04 to float + %add = fadd float %s1.06, %conv + %conv1 = sitofp i32 %m.addr.07 to float + %add2 = fadd float %s2.05, %conv1 + %inc = add nsw i32 %n.04, 1 + %dec = add nsw i32 %m.addr.07, -1 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] + %s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ] + %sub = fsub float %s1.0.lcssa, %s2.0.lcssa + ret float %sub +} + +; rdar:15221834 False AVX register dependencies cause 5x slowdown on +; flops-6. Make sure the unused register read by vcvtsi2sdq is zeroed +; to avoid cyclic dependence on a write to the same register in a +; previous iteration. + +; AVX-LABEL: loopdep2: +; AVX-LABEL: %loop +; AVX: vxorps %[[REG:xmm.]], %{{xmm.}}, %{{xmm.}} +; AVX: vcvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]], %{{xmm.}} +; SSE-LABEL: loopdep2: +; SSE-LABEL: %loop +; SSE: xorps %[[REG:xmm.]], %[[REG]] +; SSE: cvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]] +define i64 @loopdep2(i64* nocapture %x, double* nocapture %y) nounwind { +entry: + %vx = load i64* %x + br label %loop +loop: + %i = phi i64 [ 1, %entry ], [ %inc, %loop ] + %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ] + %fi = sitofp i64 %i to double + %vy = load double* %y + %fipy = fadd double %fi, %vy + %iipy = fptosi double %fipy to i64 + %s2 = add i64 %s1, %iipy + %inc = add nsw i64 %i, 1 + %exitcond = icmp eq i64 %inc, 156250000 + br i1 %exitcond, label %ret, label %loop +ret: + ret i64 %s2 +} + +; This loop contains a cvtsi2sd instruction that has a loop-carried +; false dependency on an xmm that is modified by other scalar instructions +; that follow it in the loop. Additionally, the source of convert is a +; memory operand. Verify the execution dependency fix pass breaks this +; dependency by inserting a xor before the convert. 
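+; (Without that xor, cvtsi2sdl would only partially write its xmm destination, +; so each iteration's convert would depend on whichever instruction last wrote +; that register in the previous iteration, serializing independent iterations.)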
+@x = common global [1024 x double] zeroinitializer, align 16 +@y = common global [1024 x double] zeroinitializer, align 16 +@z = common global [1024 x double] zeroinitializer, align 16 +@w = common global [1024 x double] zeroinitializer, align 16 +@v = common global [1024 x i32] zeroinitializer, align 16 + +define void @loopdep3() { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc14, %entry + %i.025 = phi i32 [ 0, %entry ], [ %inc15, %for.inc14 ] + br label %for.body3 + +for.body3: + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ] + %arrayidx = getelementptr inbounds [1024 x i32]* @v, i64 0, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %conv = sitofp i32 %0 to double + %arrayidx5 = getelementptr inbounds [1024 x double]* @x, i64 0, i64 %indvars.iv + %1 = load double* %arrayidx5, align 8 + %mul = fmul double %conv, %1 + %arrayidx7 = getelementptr inbounds [1024 x double]* @y, i64 0, i64 %indvars.iv + %2 = load double* %arrayidx7, align 8 + %mul8 = fmul double %mul, %2 + %arrayidx10 = getelementptr inbounds [1024 x double]* @z, i64 0, i64 %indvars.iv + %3 = load double* %arrayidx10, align 8 + %mul11 = fmul double %mul8, %3 + %arrayidx13 = getelementptr inbounds [1024 x double]* @w, i64 0, i64 %indvars.iv + store double %mul11, double* %arrayidx13, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.inc14, label %for.body3 + +for.inc14: ; preds = %for.body3 + %inc15 = add nsw i32 %i.025, 1 + %exitcond26 = icmp eq i32 %inc15, 100000 + br i1 %exitcond26, label %for.end16, label %for.cond1.preheader + +for.end16: ; preds = %for.inc14 + ret void + +;SSE-LABEL:@loopdep3 +;SSE: xorps [[XMM0:%xmm[0-9]+]], [[XMM0]] +;SSE-NEXT: cvtsi2sdl {{.*}}, [[XMM0]] +;SSE-NEXT: mulsd {{.*}}, [[XMM0]] +;SSE-NEXT: mulsd {{.*}}, [[XMM0]] +;SSE-NEXT: mulsd {{.*}}, [[XMM0]] +;SSE-NEXT: movsd [[XMM0]], +;AVX-LABEL:@loopdep3 +;AVX: vxorps [[XMM0:%xmm[0-9]+]], [[XMM0]] +;AVX-NEXT: vcvtsi2sdl {{.*}}, [[XMM0]], [[XMM0]] +;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] +;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] +;AVX-NEXT: vmulsd {{.*}}, [[XMM0]], [[XMM0]] +;AVX-NEXT: vmovsd [[XMM0]], +} diff --git a/test/CodeGen/X86/break-sse-dep.ll b/test/CodeGen/X86/break-sse-dep.ll deleted file mode 100644 index 8124d6f52263..000000000000 --- a/test/CodeGen/X86/break-sse-dep.ll +++ /dev/null @@ -1,62 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2 -mcpu=nehalem | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2 -mcpu=nehalem | FileCheck %s - -define double @t1(float* nocapture %x) nounwind readonly ssp { -entry: -; CHECK-LABEL: t1: -; CHECK: movss ([[A0:%rdi|%rcx]]), %xmm0 -; CHECK: cvtss2sd %xmm0, %xmm0 - - %0 = load float* %x, align 4 - %1 = fpext float %0 to double - ret double %1 -} - -define float @t2(double* nocapture %x) nounwind readonly ssp optsize { -entry: -; CHECK-LABEL: t2: -; CHECK: cvtsd2ss ([[A0]]), %xmm0 - %0 = load double* %x, align 8 - %1 = fptrunc double %0 to float - ret float %1 -} - -define float @squirtf(float* %x) nounwind { -entry: -; CHECK-LABEL: squirtf: -; CHECK: movss ([[A0]]), %xmm0 -; CHECK: sqrtss %xmm0, %xmm0 - %z = load float* %x - %t = call float @llvm.sqrt.f32(float %z) - ret float %t -} - -define double @squirt(double* %x) nounwind { -entry: -; CHECK-LABEL: squirt: -; CHECK: sqrtsd ([[A0]]), %xmm0 - %z = load double* %x - %t = call double @llvm.sqrt.f64(double %z) - ret double %t -} - -define float 
@squirtf_size(float* %x) nounwind optsize { -entry: -; CHECK-LABEL: squirtf_size: -; CHECK: sqrtss ([[A0]]), %xmm0 - %z = load float* %x - %t = call float @llvm.sqrt.f32(float %z) - ret float %t -} - -define double @squirt_size(double* %x) nounwind optsize { -entry: -; CHECK-LABEL: squirt_size: -; CHECK: sqrtsd ([[A0]]), %xmm0 - %z = load double* %x - %t = call double @llvm.sqrt.f64(double %z) - ret double %t -} - -declare float @llvm.sqrt.f32(float) -declare double @llvm.sqrt.f64(double) diff --git a/test/CodeGen/X86/byval-callee-cleanup.ll b/test/CodeGen/X86/byval-callee-cleanup.ll new file mode 100644 index 000000000000..8e059d433446 --- /dev/null +++ b/test/CodeGen/X86/byval-callee-cleanup.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -mtriple=i686-win32 | FileCheck %s + +; Previously we would forget to align to stack slot alignment after placing a +; byval argument. Subsequent arguments would align themselves, but if it was +; the last argument, the argument size would not be a multiple of stack slot +; size. This resulted in retl $6 in callee-cleanup functions, as well as subtle +; varargs bugs. + +%struct.Six = type { [6 x i8] } + +define x86_stdcallcc void @f(%struct.Six* byval %a) { + ret void +} +; CHECK-LABEL: _f@8: +; CHECK: retl $8 + +define x86_thiscallcc void @g(i8* %this, %struct.Six* byval %a) { + ret void +} +; CHECK-LABEL: _g: +; CHECK: retl $8 + +define x86_fastcallcc void @h(i32 inreg %x, i32 inreg %y, %struct.Six* byval %a) { + ret void +} +; CHECK-LABEL: @h@16: +; CHECK: retl $8 diff --git a/test/CodeGen/X86/cfi_enforcing.ll b/test/CodeGen/X86/cfi_enforcing.ll new file mode 100644 index 000000000000..bcad8c168f24 --- /dev/null +++ b/test/CodeGen/X86/cfi_enforcing.ll @@ -0,0 +1,34 @@ +; RUN: llc -mtriple=i386-unknown-linux-gnu -fcfi -cfi-enforcing <%s | FileCheck --check-prefix=X86 %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -fcfi -cfi-enforcing <%s | FileCheck --check-prefix=X86-64 %s + +define void @indirect_fun() unnamed_addr jumptable { + ret void +} + +define i32 @m(void ()* %fun) { + call void ()* %fun() +; CHECK: subl +; X86-64: andq $8, +; X86-64: leaq __llvm_jump_instr_table_0_1({{%[a-z0-9]+}}), [[REG:%[a-z0-9]+]] +; X86-64-NOT: callq __llvm_cfi_pointer_warning +; X86-64: callq *[[REG]] +; X86: andl $8, +; X86: leal __llvm_jump_instr_table_0_1({{%[a-z0-9]+}}), [[REG:%[a-z0-9]+]] +; X86-NOT: calll __llvm_cfi_pointer_warning +; X86: calll *[[REG]] + ret i32 0 +} + +define void ()* @get_fun() { + ret void ()* @indirect_fun +} + +define i32 @main(i32 %argc, i8** %argv) { + %f = call void ()* ()* @get_fun() + %a = call i32 @m(void ()* %f) + ret i32 %a +} + +; CHECK: .align 8 +; CHECK: __llvm_jump_instr_table_0_1: +; CHECK: jmp indirect_fun@PLT diff --git a/test/CodeGen/X86/cfi_invoke.ll b/test/CodeGen/X86/cfi_invoke.ll new file mode 100644 index 000000000000..dd0d42a59c3a --- /dev/null +++ b/test/CodeGen/X86/cfi_invoke.ll @@ -0,0 +1,35 @@ +; RUN: llc <%s -fcfi -cfi-type=sub | FileCheck %s +; ModuleID = 'test.cc' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare i32 @__gxx_personality_v0(...) 
+ +@_ZTIPKc = external constant i8* +@_ZTIi = external constant i8* + +define void @f() unnamed_addr jumptable { + ret void +} + +@a = global void ()* @f + +; Make sure invoke gets targeted as well as regular calls +define void @_Z3foov(void ()* %f) uwtable ssp { +; CHECK-LABEL: _Z3foov: + entry: + invoke void %f() + to label %try.cont unwind label %lpad +; CHECK: callq __llvm_cfi_pointer_warning +; CHECK: callq *%rbx + + lpad: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* bitcast (i8** @_ZTIi to i8*) + filter [1 x i8*] [i8* bitcast (i8** @_ZTIPKc to i8*)] + ret void + + try.cont: + ret void +} + diff --git a/test/CodeGen/X86/cfi_non_default_function.ll b/test/CodeGen/X86/cfi_non_default_function.ll new file mode 100644 index 000000000000..29774a1d4425 --- /dev/null +++ b/test/CodeGen/X86/cfi_non_default_function.ll @@ -0,0 +1,27 @@ +; RUN: llc -fcfi -cfi-func-name=cfi_new_failure <%s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" +define void @indirect_fun() unnamed_addr jumptable { + ret void +} + +define i32 @m(void ()* %fun) { +; CHECK-LABEL: @m + call void ()* %fun() +; CHECK: callq cfi_new_failure + ret i32 0 +} + +define void ()* @get_fun() { + ret void ()* @indirect_fun +} + +define i32 @main(i32 %argc, i8** %argv) { + %f = call void ()* ()* @get_fun() + %a = call i32 @m(void ()* %f) + ret i32 %a +} + +; CHECK: .align 8 +; CHECK: __llvm_jump_instr_table_0_1: +; CHECK: jmp indirect_fun@PLT diff --git a/test/CodeGen/X86/cfi_simple_indirect_call.ll b/test/CodeGen/X86/cfi_simple_indirect_call.ll new file mode 100644 index 000000000000..0ee118d984ea --- /dev/null +++ b/test/CodeGen/X86/cfi_simple_indirect_call.ll @@ -0,0 +1,43 @@ +; RUN: llc -fcfi -cfi-type=sub <%s | FileCheck --check-prefix=SUB %s +; RUN: llc -fcfi -cfi-type=add <%s | FileCheck --check-prefix=ADD %s +; RUN: llc -fcfi -cfi-type=ror <%s | FileCheck --check-prefix=ROR %s + +target triple = "x86_64-unknown-linux-gnu" + +define void @indirect_fun() unnamed_addr jumptable { + ret void +} + +define i32 @m(void ()* %fun) { + call void ()* %fun() +; SUB: subl +; SUB: andq $8 +; SUB-LABEL: leaq __llvm_jump_instr_table_0_1 +; SUB-LABEL: callq __llvm_cfi_pointer_warning + +; ROR: subq +; ROR: rolq $61 +; ROR: testq +; ROR-LABEL: callq __llvm_cfi_pointer_warning + +; ADD: andq $8 +; ADD-LABEL: leaq __llvm_jump_instr_table_0_1 +; ADD: cmpq +; ADD-LABEL: callq __llvm_cfi_pointer_warning +ret i32 0 +} + +define void ()* @get_fun() { + ret void ()* @indirect_fun +} + +define i32 @main(i32 %argc, i8** %argv) { + %f = call void ()* ()* @get_fun() + %a = call i32 @m(void ()* %f) + ret i32 %a +} +; SUB: .text +; SUB: .align 8 +; SUB-LABEL: .type __llvm_jump_instr_table_0_1,@function +; SUB-LABEL:__llvm_jump_instr_table_0_1: +; SUB-LABEL: jmp indirect_fun@PLT diff --git a/test/CodeGen/X86/chain_order.ll b/test/CodeGen/X86/chain_order.ll index c88726e75a81..72e6f78bdef7 100644 --- a/test/CodeGen/X86/chain_order.ll +++ b/test/CodeGen/X86/chain_order.ll @@ -1,13 +1,13 @@ ; RUN: llc < %s -mcpu=corei7-avx -mtriple=x86_64-linux | FileCheck %s -;CHECK-LABEL: cftx020: -;CHECK: vmovsd (%rdi), %xmm{{.*}} -;CHECK: vmovsd 16(%rdi), %xmm{{.*}} -;CHECK: vmovsd 24(%rdi), %xmm{{.*}} -;CHECK: vmovhpd 8(%rdi), %xmm{{.*}} -;CHECK: vmovupd %xmm{{.*}}, (%rdi) -;CHECK: vmovupd %xmm{{.*}}, 16(%rdi) -;CHECK: ret +; CHECK-LABEL: cftx020: +; CHECK: vmovsd (%rdi), %xmm{{.*}} +; CHECK-NEXT: vmovsd 16(%rdi), %xmm{{.*}} +; CHECK-NEXT: vmovhpd 24(%rdi), %xmm{{.*}} +; CHECK-NEXT: vmovhpd 8(%rdi), 
%xmm{{.*}} +; CHECK: vmovupd %xmm{{.*}}, (%rdi) +; CHECK-NEXT: vmovupd %xmm{{.*}}, 16(%rdi) +; CHECK: ret ; A test from pifft (after SLP-vectorization) that fails when we drop the chain on newly merged loads. define void @cftx020(double* nocapture %a) { diff --git a/test/CodeGen/X86/clobber-fi0.ll b/test/CodeGen/X86/clobber-fi0.ll index 38a42dbf1aa1..4876c351a413 100644 --- a/test/CodeGen/X86/clobber-fi0.ll +++ b/test/CodeGen/X86/clobber-fi0.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mcpu=generic -mtriple=x86_64-linux | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.7.0" diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll index d38d2b430ccb..355c6b4165b9 100644 --- a/test/CodeGen/X86/cmov.ll +++ b/test/CodeGen/X86/cmov.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -disable-cgp-select2branch | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-apple-darwin10 -disable-cgp-select2branch | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone { diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll new file mode 100644 index 000000000000..b7995dbdf259 --- /dev/null +++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll @@ -0,0 +1,87 @@ +; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=i386-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=x86_64-linux-gnu -pre-RA-sched=fast %s -o - | FileCheck %s + +declare i32 @bar() + +define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) { +; CHECK-LABEL: test_intervening_call: +; CHECK: cmpxchg +; CHECK: pushf[[LQ:[lq]]] +; CHECK-NEXT: pop[[LQ]] [[FLAGS:%.*]] + +; CHECK-NEXT: call[[LQ]] bar + +; CHECK-NEXT: push[[LQ]] [[FLAGS]] +; CHECK-NEXT: popf[[LQ]] +; CHECK-NEXT: jne + %cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst + %p = extractvalue { i64, i1 } %cx, 1 + call i32 @bar() + br i1 %p, label %t, label %f + +t: + ret i64 42 + +f: + ret i64 0 +} + +; Interesting in producing a clobber without any function calls. 
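+; Here the flags from the cmpxchg are consumed by a branch in the same block, +; so no pushf/popf save is needed: the CHECK lines expect the jne to follow +; the cmpxchg directly.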
+define i32 @test_control_flow(i32* %p, i32 %i, i32 %j) { +; CHECK-LABEL: test_control_flow: + +; CHECK: cmpxchg +; CHECK-NEXT: jne +entry: + %cmp = icmp sgt i32 %i, %j + br i1 %cmp, label %loop_start, label %cond.end + +loop_start: + br label %while.condthread-pre-split.i + +while.condthread-pre-split.i: + %.pr.i = load i32* %p, align 4 + br label %while.cond.i + +while.cond.i: + %0 = phi i32 [ %.pr.i, %while.condthread-pre-split.i ], [ 0, %while.cond.i ] + %tobool.i = icmp eq i32 %0, 0 + br i1 %tobool.i, label %while.cond.i, label %while.body.i + +while.body.i: + %.lcssa = phi i32 [ %0, %while.cond.i ] + %1 = cmpxchg i32* %p, i32 %.lcssa, i32 %.lcssa seq_cst seq_cst + %2 = extractvalue { i32, i1 } %1, 1 + br i1 %2, label %cond.end.loopexit, label %while.condthread-pre-split.i + +cond.end.loopexit: + br label %cond.end + +cond.end: + %cond = phi i32 [ %i, %entry ], [ 0, %cond.end.loopexit ] + ret i32 %cond +} + +; This one is an interesting case because CMOV doesn't have a chain +; operand. Naive attempts to limit cmpxchg EFLAGS use are likely to fail here. +define i32 @test_feed_cmov(i32* %addr, i32 %desired, i32 %new) { +; CHECK-LABEL: test_feed_cmov: + +; CHECK: cmpxchg +; CHECK: pushf[[LQ:[lq]]] +; CHECK-NEXT: pop[[LQ]] [[FLAGS:%.*]] + +; CHECK-NEXT: call[[LQ]] bar + +; CHECK-NEXT: push[[LQ]] [[FLAGS]] +; CHECK-NEXT: popf[[LQ]] + %res = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst + %success = extractvalue { i32, i1 } %res, 1 + + %rhs = call i32 @bar() + + %ret = select i1 %success, i32 %new, i32 %rhs + ret i32 %ret +} diff --git a/test/CodeGen/X86/coalesce_commute_subreg.ll b/test/CodeGen/X86/coalesce_commute_subreg.ll new file mode 100644 index 000000000000..8d0a20cfebbd --- /dev/null +++ b/test/CodeGen/X86/coalesce_commute_subreg.ll @@ -0,0 +1,51 @@ +; RUN: llc -mtriple="x86_64-apple-darwin" -o - -verify-machineinstrs %s + +define void @make_wanted() #0 { +entry: + br i1 undef, label %for.end20, label %for.cond1.preheader.lr.ph + +for.cond1.preheader.lr.ph: + br label %for.body3 + +for.body3: + %cmp20.i = icmp eq i32 undef, 0 + %.col.057 = select i1 %cmp20.i, i32 0, i32 undef + br i1 undef, label %while.cond.i, label %for.body5.lr.ph.i + +for.body5.lr.ph.i: + %0 = sext i32 %.col.057 to i64 + %1 = sub i32 0, %.col.057 + %2 = zext i32 %1 to i64 + %3 = add nuw nsw i64 %2, 1 + %n.vec110 = and i64 %3, 8589934588 + %end.idx.rnd.down111 = add nsw i64 %n.vec110, %0 + br i1 undef, label %middle.block105, label %vector.ph103 + +vector.ph103: + br i1 undef, label %middle.block105, label %vector.body104 + +vector.body104: + %4 = icmp eq i64 undef, %end.idx.rnd.down111 + br i1 %4, label %middle.block105, label %vector.body104 + +middle.block105: + %resume.val114 = phi i64 [ %0, %for.body5.lr.ph.i ], [ %end.idx.rnd.down111, %vector.body104 ], [ %end.idx.rnd.down111, %vector.ph103 ] + %cmp.n116 = icmp eq i64 undef, %resume.val114 + br i1 %cmp.n116, label %while.cond.i, label %for.body5.i.preheader + +for.body5.i.preheader: + %lcmp.or182 = or i1 undef, undef + br i1 %lcmp.or182, label %for.body5.i.prol, label %while.cond.i + +for.body5.i.prol: + br i1 undef, label %for.body5.i.prol, label %while.cond.i + +while.cond.i: + br i1 undef, label %while.cond.i, label %if.then + +if.then: + br label %for.body3 + +for.end20: + ret void +} diff --git a/test/CodeGen/X86/coalescer-dce.ll b/test/CodeGen/X86/coalescer-dce.ll index 7f72e3d8667e..208d70660faa 100644 --- a/test/CodeGen/X86/coalescer-dce.ll +++ b/test/CodeGen/X86/coalescer-dce.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -disable-fp-elim 
-disable-machine-dce -verify-coalescing +; RUN: llc < %s -verify-machineinstrs -disable-fp-elim -disable-machine-dce -verify-coalescing target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-macosx10.7.0" diff --git a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll index 78e1dd287f6e..85bfff2757e6 100644 --- a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll +++ b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll @@ -1,7 +1,7 @@ ; RUN: opt -S -codegenprepare %s -o - | FileCheck %s ; RUN: opt -S -codegenprepare -addr-sink-using-gep=1 %s -o - | FileCheck -check-prefix=CHECK-GEP %s ; This file tests the different cases that are involved when codegen prepare -; tries to get sign extension out of the way of addressing mode. +; tries to get sign/zero extension out of the way of addressing mode. ; These tests require an actual target as addressing mode decisions depend ; on the target. @@ -67,6 +67,43 @@ define i8 @oneArgPromotion(i32 %arg1, i8* %base) { ret i8 %res } +; Check that we are able to merge a sign extension with a zero extension. +; CHECK-LABEL: @oneArgPromotionZExt +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1ZEXT]], 1 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionZExt(i8 %arg1, i8* %base) { + %zext = zext i8 %arg1 to i32 + %add = add nsw i32 %zext, 1 + %sextadd = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd + %res = load i8* %arrayidx + ret i8 %res +} + +; When promoting a constant zext, the IR builder returns a constant, +; not an instruction. Make sure this is properly handled. This used +; to crash. +; Note: The constant zext is promoted, but does not help matching +; more things in the addressing mode. Therefore the modification is +; rolled back. +; Still, this test case exercises the desired code path. +; CHECK-LABEL: @oneArgPromotionCstZExt +; CHECK: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i16 undef to i32 +; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXT]] to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXT]], 1 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionCstZExt(i8* %base) { + %cst = zext i16 undef to i32 + %add = add nsw i32 %cst, 1 + %sextadd = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd + %res = load i8* %arrayidx + ret i8 %res +} + ; Check that we do not promote truncate when we cannot determine the ; bits that are dropped. ; CHECK-LABEL: @oneArgPromotionBlockTrunc1 @@ -321,3 +358,177 @@ end: %final = load i32* %addr ret i32 %final } + +%struct.dns_packet = type { i32, i32, %union.anon } +%union.anon = type { i32 } + +@a = common global i32 0, align 4 +@b = common global i16 0, align 2 + +; We used to crash on this function because we did not return the right +; promoted instruction for %conv.i. +; Make sure we generate the right code now. +; CHECK-LABEL: @fn3 +; %conv.i is used twice and only one of its uses is being promoted. +; Use it as the starting point for the matching.
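+; The address of the final load is rebuilt entirely on i64: the promoted zext +; feeds an add of the ptrtoint of %P plus the constant offset 7 before being +; converted back to a pointer, as the CHECK lines below spell out.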
+; CHECK: %conv.i = zext i16 [[PLAIN_OPND:%[.a-zA-Z_0-9-]+]] to i32 +; CHECK-NEXT: [[PROMOTED_CONV:%[.a-zA-Z_0-9-]+]] = zext i16 [[PLAIN_OPND]] to i64 +; CHECK-NEXT: [[BASE:%[a-zA-Z_0-9-]+]] = ptrtoint %struct.dns_packet* %P to i64 +; CHECK-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add i64 [[BASE]], [[PROMOTED_CONV]] +; CHECK-NEXT: [[ADDR:%[a-zA-Z_0-9-]+]] = add i64 [[ADD]], 7 +; CHECK-NEXT: [[CAST:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[ADDR]] to i8* +; CHECK-NEXT: load i8* [[CAST]], align 1 +define signext i16 @fn3(%struct.dns_packet* nocapture readonly %P) { +entry: + %tmp = getelementptr inbounds %struct.dns_packet* %P, i64 0, i32 2 + %data.i.i = bitcast %union.anon* %tmp to [0 x i8]* + br label %while.body.i.i + +while.body.i.i: ; preds = %while.body.i.i, %entry + %src.addr.0.i.i = phi i16 [ 0, %entry ], [ %inc.i.i, %while.body.i.i ] + %inc.i.i = add i16 %src.addr.0.i.i, 1 + %idxprom.i.i = sext i16 %src.addr.0.i.i to i64 + %arrayidx.i.i = getelementptr inbounds [0 x i8]* %data.i.i, i64 0, i64 %idxprom.i.i + %tmp1 = load i8* %arrayidx.i.i, align 1 + %conv2.i.i = zext i8 %tmp1 to i32 + %and.i.i = and i32 %conv2.i.i, 15 + store i32 %and.i.i, i32* @a, align 4 + %tobool.i.i = icmp eq i32 %and.i.i, 0 + br i1 %tobool.i.i, label %while.body.i.i, label %fn1.exit.i + +fn1.exit.i: ; preds = %while.body.i.i + %inc.i.i.lcssa = phi i16 [ %inc.i.i, %while.body.i.i ] + %conv.i = zext i16 %inc.i.i.lcssa to i32 + %sub.i = add nsw i32 %conv.i, -1 + %idxprom.i = sext i32 %sub.i to i64 + %arrayidx.i = getelementptr inbounds [0 x i8]* %data.i.i, i64 0, i64 %idxprom.i + %tmp2 = load i8* %arrayidx.i, align 1 + %conv2.i = sext i8 %tmp2 to i16 + store i16 %conv2.i, i16* @b, align 2 + %sub4.i = sub nsw i32 0, %conv.i + %conv5.i = zext i16 %conv2.i to i32 + %cmp.i = icmp sgt i32 %conv5.i, %sub4.i + br i1 %cmp.i, label %if.then.i, label %fn2.exit + +if.then.i: ; preds = %fn1.exit.i + %end.i = getelementptr inbounds %struct.dns_packet* %P, i64 0, i32 1 + %tmp3 = load i32* %end.i, align 4 + %sub7.i = add i32 %tmp3, 65535 + %conv8.i = trunc i32 %sub7.i to i16 + br label %fn2.exit + +fn2.exit: ; preds = %if.then.i, %fn1.exit.i + %retval.0.i = phi i16 [ %conv8.i, %if.then.i ], [ undef, %fn1.exit.i ] + ret i16 %retval.0.i +} + +; Check that we do not promote an extension if the non-wrapping flag does not +; match the kind of the extension. +; CHECK-LABEL: @noPromotionFlag +; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = zext i32 [[ADD]] to i64 +; CHECK: inttoptr i64 [[PROMOTED]] to i8* +; CHECK: ret +define i8 @noPromotionFlag(i32 %arg1, i32 %arg2) { + %add = add nsw i32 %arg1, %arg2 + %zextadd = zext i32 %add to i64 + %base = inttoptr i64 %zextadd to i8* + %res = load i8* %base + ret i8 %res +} + +; Check that we correctly promote both operands of the promotable add with zext. +; CHECK-LABEL: @twoArgsPromotionZExt +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg1 to i64 +; CHECK: [[ARG2ZEXT:%[a-zA-Z_0-9-]+]] = zext i32 %arg2 to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], [[ARG2ZEXT]] +; CHECK: inttoptr i64 [[PROMOTED]] to i8* +; CHECK: ret +define i8 @twoArgsPromotionZExt(i32 %arg1, i32 %arg2) { + %add = add nuw i32 %arg1, %arg2 + %zextadd = zext i32 %add to i64 + %base = inttoptr i64 %zextadd to i8* + %res = load i8* %base + ret i8 %res +} + +; Check that we correctly promote constant arguments. 
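+; The i8 constant -1 is zero-extended as part of the promotion, which is why +; it reappears as 255 in the promoted i64 add below.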
+; CHECK-LABEL: @oneArgPromotionNegativeCstZExt +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 255 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionNegativeCstZExt(i8 %arg1, i8* %base) { + %add = add nuw i8 %arg1, -1 + %zextadd = zext i8 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd + %res = load i8* %arrayidx + ret i8 %res +} + +; Check that we are able to merge two zero extensions. +; CHECK-LABEL: @oneArgPromotionZExtZExt +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 %arg1 to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionZExtZExt(i8 %arg1, i8* %base) { + %zext = zext i8 %arg1 to i32 + %add = add nuw i32 %zext, 1 + %zextadd = zext i32 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd + %res = load i8* %arrayidx + ret i8 %res +} + +; Check that we do not promote truncate when the dropped bits +; are of a different kind. +; CHECK-LABEL: @oneArgPromotionBlockTruncZExt +; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32 +; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8 +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1TRUNC]] to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionBlockTruncZExt(i1 %arg1, i8* %base) { + %sextarg1 = sext i1 %arg1 to i32 + %trunc = trunc i32 %sextarg1 to i8 + %add = add nuw i8 %trunc, 1 + %zextadd = zext i8 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd + %res = load i8* %arrayidx + ret i8 %res +} + +; Check that we are able to promote truncate when we know all the bits +; that are dropped. +; CHECK-LABEL: @oneArgPromotionPassTruncZExt +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i1 %arg1 to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionPassTruncZExt(i1 %arg1, i8* %base) { + %sextarg1 = zext i1 %arg1 to i32 + %trunc = trunc i32 %sextarg1 to i8 + %add = add nuw i8 %trunc, 1 + %zextadd = zext i8 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd + %res = load i8* %arrayidx + ret i8 %res +} + +; Check that we do not promote sext with zext. 
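+; Only the outer zext is promoted to i64 here; the sext of %arg1 must stay an +; i8 sext, because folding a sext into a zext would change which bits get +; extended.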
+; CHECK-LABEL: @oneArgPromotionBlockSExtZExt +; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i8 +; CHECK: [[ARG1ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[ARG1SEXT]] to i64 +; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ARG1ZEXT]], 1 +; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]] +; CHECK: ret +define i8 @oneArgPromotionBlockSExtZExt(i1 %arg1, i8* %base) { + %sextarg1 = sext i1 %arg1 to i8 + %add = add nuw i8 %sextarg1, 1 + %zextadd = zext i8 %add to i64 + %arrayidx = getelementptr inbounds i8* %base, i64 %zextadd + %res = load i8* %arrayidx + ret i8 %res +} diff --git a/test/CodeGen/X86/codegen-prepare-extload.ll b/test/CodeGen/X86/codegen-prepare-extload.ll index 9320706d9728..9b27c33a80f9 100644 --- a/test/CodeGen/X86/codegen-prepare-extload.ll +++ b/test/CodeGen/X86/codegen-prepare-extload.ll @@ -1,12 +1,21 @@ ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-win64 | FileCheck %s -; rdar://7304838 +; RUN: opt -codegenprepare < %s -mtriple=x86_64-apple-macosx -S | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=NONSTRESS +; RUN: opt -codegenprepare < %s -mtriple=x86_64-apple-macosx -S -stress-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=STRESS +; RUN: opt -codegenprepare < %s -mtriple=x86_64-apple-macosx -S -disable-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=DISABLE +; rdar://7304838 ; CodeGenPrepare should move the zext into the block with the load ; so that SelectionDAG can select it with the load. - +; +; CHECK-LABEL: foo: ; CHECK: movsbl ({{%rdi|%rcx}}), %eax - +; +; OPTALL-LABEL: @foo +; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p +; OPTALL-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32 +; OPTALL: store i32 [[ZEXT]], i32* %q +; OPTALL: ret define void @foo(i8* %p, i32* %q) { entry: %t = load i8* %p @@ -19,3 +28,336 @@ true: false: ret void } + +; Check that we manage to form a zextload when an operation with only one +; argument to explicitly extend is in the way. +; OPTALL-LABEL: @promoteOneArg +; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p +; OPT-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32 +; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT]], 2 +; Make sure the operation is not promoted when the promotion pass is disabled. +; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], 2 +; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32 +; OPTALL: store i32 [[RES]], i32* %q +; OPTALL: ret +define void @promoteOneArg(i8* %p, i32* %q) { +entry: + %t = load i8* %p + %add = add nuw i8 %t, 2 + %a = icmp slt i8 %t, 20 + br i1 %a, label %true, label %false +true: + %s = zext i8 %add to i32 + store i32 %s, i32* %q + ret void +false: + ret void +} + +; Check that we manage to form a sextload when an operation with only one +; argument to explicitly extend is in the way. +; Version with sext.
+; OPTALL-LABEL: @promoteOneArgSExt +; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p +; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32 +; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXT]], 2 +; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], 2 +; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32 +; OPTALL: store i32 [[RES]], i32* %q +; OPTALL: ret +define void @promoteOneArgSExt(i8* %p, i32* %q) { +entry: + %t = load i8* %p + %add = add nsw i8 %t, 2 + %a = icmp slt i8 %t, 20 + br i1 %a, label %true, label %false +true: + %s = sext i8 %add to i32 + store i32 %s, i32* %q + ret void +false: + ret void +} + +; Check that we manage to form a zextload when an operation with two +; arguments to explicitly extend is in the way. +; Extending %add will create two extensions: +; 1. One for %b. +; 2. One for %t. +; #1 will not be removed as we do not know anything about %b. +; #2 may not be merged with the load because %t is used in a comparison. +; Since two extensions may be emitted in the end instead of one before the +; transformation, the regular heuristic does not apply the optimization. +; +; OPTALL-LABEL: @promoteTwoArgZext +; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p +; +; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32 +; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32 +; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]] +; +; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b +; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32 +; +; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b +; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32 +; +; OPTALL: store i32 [[RES]], i32* %q +; OPTALL: ret +define void @promoteTwoArgZext(i8* %p, i32* %q, i8 %b) { +entry: + %t = load i8* %p + %add = add nuw i8 %t, %b + %a = icmp slt i8 %t, 20 + br i1 %a, label %true, label %false +true: + %s = zext i8 %add to i32 + store i32 %s, i32* %q + ret void +false: + ret void +} + +; Check that we manage to form a sextload when an operation with two +; arguments to explicitly extend is in the way. +; Version with sext. +; OPTALL-LABEL: @promoteTwoArgSExt +; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p +; +; STRESS-NEXT: [[SEXTLD:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32 +; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i8 %b to i32 +; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXTLD]], [[SEXTB]] +; +; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b +; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32 +; +; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b +; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32 +; OPTALL: store i32 [[RES]], i32* %q +; OPTALL: ret +define void @promoteTwoArgSExt(i8* %p, i32* %q, i8 %b) { +entry: + %t = load i8* %p + %add = add nsw i8 %t, %b + %a = icmp slt i8 %t, 20 + br i1 %a, label %true, label %false +true: + %s = sext i8 %add to i32 + store i32 %s, i32* %q + ret void +false: + ret void +} + +; Check that we do not form a zextload if we need to introduce more than +; one additional extension.
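+; Promoting %add through both adds below would require zexts of %b and %c, +; i.e., two new extensions to remove a single one, so the default heuristic +; declines; only the stress mode performs it, as the STRESS lines show.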
+; OPTALL-LABEL: @promoteThreeArgZext +; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p +; +; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32 +; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32 +; STRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]] +; STRESS-NEXT: [[ZEXTC:%[a-zA-Z_0-9-]+]] = zext i8 %c to i32 +; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[TMP]], [[ZEXTC]] +; +; NONSTRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b +; NONSTRESS-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[TMP]], %c +; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32 +; +; DISABLE: add nuw i8 +; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 +; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32 +; +; OPTALL: store i32 [[RES]], i32* %q +; OPTALL: ret +define void @promoteThreeArgZext(i8* %p, i32* %q, i8 %b, i8 %c) { +entry: + %t = load i8* %p + %tmp = add nuw i8 %t, %b + %add = add nuw i8 %tmp, %c + %a = icmp slt i8 %t, 20 + br i1 %a, label %true, label %false +true: + %s = zext i8 %add to i32 + store i32 %s, i32* %q + ret void +false: + ret void +} + +; Check that we manage to form a zextload after promoting and merging +; two extensions. +; OPTALL-LABEL: @promoteMergeExtArgZExt +; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p +; +; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32 +; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i16 %b to i32 +; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]] +; +; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16 +; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b +; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32 +; +; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16 +; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b +; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32 +; +; OPTALL: store i32 [[RES]], i32* %q +; OPTALL: ret +define void @promoteMergeExtArgZExt(i8* %p, i32* %q, i16 %b) { +entry: + %t = load i8* %p + %ext = zext i8 %t to i16 + %add = add nuw i16 %ext, %b + %a = icmp slt i8 %t, 20 + br i1 %a, label %true, label %false +true: + %s = zext i16 %add to i32 + store i32 %s, i32* %q + ret void +false: + ret void +} + +; Check that we manage to form a sextload after promoting and merging +; two extensions. +; Version with sext. 
+; OPTALL-LABEL: @promoteMergeExtArgSExt +; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %p +; +; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32 +; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = sext i16 %b to i32 +; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXTLD]], [[ZEXTB]] +; +; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16 +; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b +; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32 +; +; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16 +; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b +; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32 +; OPTALL: store i32 [[RES]], i32* %q +; OPTALL: ret +define void @promoteMergeExtArgSExt(i8* %p, i32* %q, i16 %b) { +entry: + %t = load i8* %p + %ext = zext i8 %t to i16 + %add = add nsw i16 %ext, %b + %a = icmp slt i8 %t, 20 + br i1 %a, label %true, label %false +true: + %s = sext i16 %add to i32 + store i32 %s, i32* %q + ret void +false: + ret void +} + +; Check that we manage to catch all the extload opportunities that are exposed +; by the different iterations of codegen prepare. +; Moreover, check that we do not promote more than we need to. +; Here is what is happening in this test (not necessarily in this order): +; 1. We try to promote the operand of %sextadd. +; a. This creates one sext of %ld2 and one of %zextld +; b. The sext of %ld2 can be combined with %ld2, so we remove one sext but +; introduce one. This is fine with the current heuristic: neutral. +; => We have one zext of %zextld left and we created one sext of %ld2. +; 2. We try to promote the operand of %sextaddza. +; a. This creates one sext of %zexta and one of %zextld +; b. The sext of %zexta does not lead to any load, so it stays here, even if it +; could have been combined with the zext of %a. +; c. The sext of %zextld leads to %ld and can be combined with it. This is +; done by promoting %zextld. This is fine with the current heuristic: +; neutral. +; => We have created a new zext of %ld and we created one sext of %zexta. +; 3. We try to promote the operand of %sextaddb. +; a. This creates one sext of %b and one of %zextld +; b. The sext of %b is a dead-end, nothing to be done. +; c. Same thing as 2.c. happens. +; => We have created a new zext of %ld and we created one sext of %b. +; 4. We try to promote the operand of the zext of %zextld introduced in #1. +; a. Same thing as 2.c. happens. +; b. %zextld does not have any other uses. It is dead code and gets +; eliminated. +; => We have created a new zext of %ld and we removed a zext of %zextld and +; a zext of %ld. +; Currently we do not try to reuse existing extensions, so in the end we have +; 3 identical zext of %ld. The extensions will be CSE'ed by SDag. +; +; OPTALL-LABEL: @severalPromotions +; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8* %addr1 +; OPT-NEXT: [[ZEXTLD1_1:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64 +; OPT-NEXT: [[ZEXTLD1_2:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64 +; OPT-NEXT: [[ZEXTLD1_3:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64 +; OPT-NEXT: [[LD2:%[a-zA-Z_0-9-]+]] = load i32* %addr2 +; OPT-NEXT: [[SEXTLD2:%[a-zA-Z_0-9-]+]] = sext i32 [[LD2]] to i64 +; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD2]], [[ZEXTLD1_1]] +; We do not combine this one: see 2.b.
+; OPT-NEXT: [[ZEXTA:%[a-zA-Z_0-9-]+]] = zext i8 %a to i32 +; OPT-NEXT: [[SEXTZEXTA:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXTA]] to i64 +; OPT-NEXT: [[RESZA:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTZEXTA]], [[ZEXTLD1_3]] +; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64 +; OPT-NEXT: [[RESB:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTB]], [[ZEXTLD1_2]] +; +; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 +; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64 +; DISABLE: [[ADDZA:%[a-zA-Z_0-9-]+]] = add nsw i32 +; DISABLE: [[RESZA:%[a-zA-Z_0-9-]+]] = sext i32 [[ADDZA]] to i64 +; DISABLE: [[ADDB:%[a-zA-Z_0-9-]+]] = add nsw i32 +; DISABLE: [[RESB:%[a-zA-Z_0-9-]+]] = sext i32 [[ADDB]] to i64 +; +; OPTALL: call void @dummy(i64 [[RES]], i64 [[RESZA]], i64 [[RESB]]) +; OPTALL: ret +define void @severalPromotions(i8* %addr1, i32* %addr2, i8 %a, i32 %b) { + %ld = load i8* %addr1 + %zextld = zext i8 %ld to i32 + %ld2 = load i32* %addr2 + %add = add nsw i32 %ld2, %zextld + %sextadd = sext i32 %add to i64 + %zexta = zext i8 %a to i32 + %addza = add nsw i32 %zexta, %zextld + %sextaddza = sext i32 %addza to i64 + %addb = add nsw i32 %b, %zextld + %sextaddb = sext i32 %addb to i64 + call void @dummy(i64 %sextadd, i64 %sextaddza, i64 %sextaddb) + ret void +} + +declare void @dummy(i64, i64, i64) + +; Make sure we do not try to promote vector types since the type promotion +; helper does not support them for now. +; OPTALL-LABEL: @vectorPromotion +; OPTALL: [[SHL:%[a-zA-Z_0-9-]+]] = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8> +; OPTALL: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext <2 x i32> [[SHL]] to <2 x i64> +; OPTALL: ret +define void @vectorPromotion() { +entry: + %a = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8> + %b = zext <2 x i32> %a to <2 x i64> + ret void +} + +@a = common global i32 0, align 4 +@c = common global [2 x i32] zeroinitializer, align 4 + +; PR21978. +; Make sure we support promotion of operands that produce a Value as opposed +; to an instruction. +; This used to cause a crash.
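+; The second operand of the add below is a constant expression (a zext of an +; icmp on global addresses), so rebuilding it at the promoted type yields a +; Value rather than an Instruction.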
+; OPTALL-LABEL: @promotionOfArgEndsUpInValue +; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i16* %addr + +; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i16 [[LD]] to i32 +; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw nsw i32 [[SEXT]], zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32]* @c, i64 0, i64 1), i32* @a) to i32) +; +; DISABLE-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw nsw i16 [[LD]], zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32]* @c, i64 0, i64 1), i32* @a) to i16) +; DISABLE-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32 +; +; OPTALL-NEXT: ret i32 [[RES]] +define i32 @promotionOfArgEndsUpInValue(i16* %addr) { +entry: + %val = load i16* %addr + %add = add nuw nsw i16 %val, zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32]* @c, i64 0, i64 1), i32* @a) to i16) + %conv3 = sext i16 %add to i32 + ret i32 %conv3 +} diff --git a/test/CodeGen/X86/coff-comdat.ll b/test/CodeGen/X86/coff-comdat.ll index bf27b2fff1fa..dcbbe1097d53 100644 --- a/test/CodeGen/X86/coff-comdat.ll +++ b/test/CodeGen/X86/coff-comdat.ll @@ -1,58 +1,58 @@ ; RUN: llc -mtriple i386-pc-win32 < %s | FileCheck %s $f1 = comdat any -@v1 = global i32 0, comdat $f1 -define void @f1() comdat $f1 { +@v1 = global i32 0, comdat($f1) +define void @f1() comdat($f1) { ret void } $f2 = comdat exactmatch -@v2 = global i32 0, comdat $f2 -define void @f2() comdat $f2 { +@v2 = global i32 0, comdat($f2) +define void @f2() comdat($f2) { ret void } $f3 = comdat largest -@v3 = global i32 0, comdat $f3 -define void @f3() comdat $f3 { +@v3 = global i32 0, comdat($f3) +define void @f3() comdat($f3) { ret void } $f4 = comdat noduplicates -@v4 = global i32 0, comdat $f4 -define void @f4() comdat $f4 { +@v4 = global i32 0, comdat($f4) +define void @f4() comdat($f4) { ret void } $f5 = comdat samesize -@v5 = global i32 0, comdat $f5 -define void @f5() comdat $f5 { +@v5 = global i32 0, comdat($f5) +define void @f5() comdat($f5) { ret void } $f6 = comdat samesize -@v6 = global i32 0, comdat $f6 -@f6 = global i32 0, comdat $f6 +@v6 = global i32 0, comdat($f6) +@f6 = global i32 0, comdat($f6) $"\01@f7@0" = comdat any -define x86_fastcallcc void @"\01@v7@0"() comdat $"\01@f7@0" { +define x86_fastcallcc void @"\01@v7@0"() comdat($"\01@f7@0") { ret void } -define x86_fastcallcc void @"\01@f7@0"() comdat $"\01@f7@0" { +define x86_fastcallcc void @"\01@f7@0"() comdat($"\01@f7@0") { ret void } $f8 = comdat any -define x86_fastcallcc void @v8() comdat $f8 { +define x86_fastcallcc void @v8() comdat($f8) { ret void } -define x86_fastcallcc void @f8() comdat $f8 { +define x86_fastcallcc void @f8() comdat($f8) { ret void } $vftable = comdat largest -@some_name = private unnamed_addr constant [2 x i8*] zeroinitializer, comdat $vftable +@some_name = private unnamed_addr constant [2 x i8*] zeroinitializer, comdat($vftable) @vftable = alias getelementptr([2 x i8*]* @some_name, i32 0, i32 1) ; CHECK: .section .text,"xr",discard,_f1 @@ -73,19 +73,19 @@ $vftable = comdat largest ; CHECK: .globl @v8@0 ; CHECK: .section .text,"xr",discard,@f8@0 ; CHECK: .globl @f8@0 -; CHECK: .section .bss,"bw",associative,_f1 +; CHECK: .section .bss,"wb",associative,_f1 ; CHECK: .globl _v1 -; CHECK: .section .bss,"bw",associative,_f2 +; CHECK: .section .bss,"wb",associative,_f2 ; CHECK: .globl _v2 -; CHECK: .section .bss,"bw",associative,_f3 +; CHECK: .section .bss,"wb",associative,_f3 ; CHECK: .globl _v3 -; CHECK: .section .bss,"bw",associative,_f4 +; CHECK: .section .bss,"wb",associative,_f4 ; CHECK: .globl _v4 -; CHECK: .section .bss,"bw",associative,_f5 +; CHECK: 
.section .bss,"wb",associative,_f5 ; CHECK: .globl _v5 -; CHECK: .section .bss,"bw",associative,_f6 +; CHECK: .section .bss,"wb",associative,_f6 ; CHECK: .globl _v6 -; CHECK: .section .bss,"bw",same_size,_f6 +; CHECK: .section .bss,"wb",same_size,_f6 ; CHECK: .globl _f6 ; CHECK: .section .rdata,"rd",largest,_vftable ; CHECK: .globl _vftable diff --git a/test/CodeGen/X86/coff-comdat2.ll b/test/CodeGen/X86/coff-comdat2.ll index 6744b5b02ad7..a417d096c47d 100644 --- a/test/CodeGen/X86/coff-comdat2.ll +++ b/test/CodeGen/X86/coff-comdat2.ll @@ -5,5 +5,5 @@ target triple = "i686-pc-windows-msvc" $foo = comdat largest @foo = global i32 0 -@bar = global i32 0, comdat $foo -; CHECK: Associative COMDAT symbol 'foo' is not a key for it's COMDAT. +@bar = global i32 0, comdat($foo) +; CHECK: Associative COMDAT symbol 'foo' is not a key for its COMDAT. diff --git a/test/CodeGen/X86/coff-comdat3.ll b/test/CodeGen/X86/coff-comdat3.ll index 76e464b27547..01651ce4820a 100644 --- a/test/CodeGen/X86/coff-comdat3.ll +++ b/test/CodeGen/X86/coff-comdat3.ll @@ -4,5 +4,5 @@ target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32" target triple = "i686-pc-windows-msvc" $foo = comdat largest -@bar = global i32 0, comdat $foo +@bar = global i32 0, comdat($foo) ; CHECK: Associative COMDAT symbol 'foo' does not exist. diff --git a/test/CodeGen/X86/combine-and.ll b/test/CodeGen/X86/combine-and.ll new file mode 100644 index 000000000000..dace806b4bb9 --- /dev/null +++ b/test/CodeGen/X86/combine-and.ll @@ -0,0 +1,164 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s +; +; Verify that the DAGCombiner is able to fold a vector AND into a blend +; if one of the operands to the AND is a vector of all constants, and each +; constant element is either zero or all-ones. 
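+; +; For example, an AND with <i32 -1, i32 0, i32 0, i32 0> keeps only lane 0 of +; the input and zeroes the rest, which is exactly a blend of the input with a +; zeroed register; the pblendw patterns below cover each mask arrangement.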
+ + +define <4 x i32> @test1(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test1 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test2(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test2 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test3(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test3 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test4(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 0, i32 0, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test4 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test5(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test5 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test6(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test6 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test7(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 0, i32 -1, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test7 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test8(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 0, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test8 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test9(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test9 +; CHECK: movq %xmm0, %xmm0 +; CHECK-NEXT: retq + + +define <4 x i32> @test10(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test10 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test11(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 -1, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test11 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test12(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 -1, i32 0> + ret <4 x i32> %1 +} +; CHECK-LABEL: test12 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test13(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test13 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test14(<4 x i32> %A) { + %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1> + ret <4 x i32> %1 +} +; CHECK-LABEL: test14 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test15(<4 x i32> %A, <4 x i32> %B) { + %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1> + %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 0> + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} +; CHECK-LABEL: test15 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq + + 
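+; In the next two cases the masks on %A and %B are complementary, so the +; and/and/or sequence reduces to a single blend of the two inputs.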
+define <4 x i32> @test16(<4 x i32> %A, <4 x i32> %B) { + %1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0> + %2 = and <4 x i32> %B, <i32 0, i32 -1, i32 0, i32 -1> + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} +; CHECK-LABEL: test16 +; CHECK: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-NEXT: retq + + +define <4 x i32> @test17(<4 x i32> %A, <4 x i32> %B) { + %1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1> + %2 = and <4 x i32> %B, <i32 -1, i32 0, i32 -1, i32 0> + %3 = or <4 x i32> %1, %2 + ret <4 x i32> %3 +} +; CHECK-LABEL: test17 +; CHECK: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll index df3b9015adda..280fcbc7a3a7 100644 --- a/test/CodeGen/X86/combine-or.ll +++ b/test/CodeGen/X86/combine-or.ll @@ -5,277 +5,290 @@ ; instruction which performs a blend operation. define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test1: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; CHECK-NEXT: retq %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2> %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1> %or = or <2 x i64> %shuf1, %shuf2 ret <2 x i64> %or } -; CHECK-LABEL: test1 -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK-NOT: orps -; CHECK: ret define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test2: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } -; CHECK-LABEL: test2 -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK: ret define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test3: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1> %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2> %or = or <2 x i64> %shuf1, %shuf2 ret <2 x i64> %or } -; CHECK-LABEL: test3 -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK-NEXT: ret define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test4: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } -; CHECK-LABEL: test4 -; CHECK-NOT: xorps -; CHECK: movss -; CHECK-NOT: orps -; CHECK: ret define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test5: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } -; CHECK-LABEL: test5 -; CHECK-NOT: xorps -; CHECK: movss -; CHECK-NEXT: ret define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: 
test6: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3> %or = or <4 x i32> %shuf1, %shuf2 ret <4 x i32> %or } -; CHECK-LABEL: test6 -; CHECK-NOT: xorps -; CHECK: blendps $12 -; CHECK-NEXT: ret define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test7: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; CHECK-NEXT: retq %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0> %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1> %or = or <4 x i32> %and1, %and2 ret <4 x i32> %or } -; CHECK-LABEL: test7 -; CHECK-NOT: xorps -; CHECK: blendps $12 -; CHECK-NEXT: ret define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test8: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; CHECK-NEXT: retq %and1 = and <2 x i64> %a, <i64 -1, i64 0> %and2 = and <2 x i64> %b, <i64 0, i64 -1> %or = or <2 x i64> %and1, %and2 ret <2 x i64> %or } -; CHECK-LABEL: test8 -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK-NOT: orps -; CHECK: ret define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test9: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1> %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0> %or = or <4 x i32> %and1, %and2 ret <4 x i32> %or } -; CHECK-LABEL: test9 -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK: ret define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test10: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq %and1 = and <2 x i64> %a, <i64 0, i64 -1> %and2 = and <2 x i64> %b, <i64 -1, i64 0> %or = or <2 x i64> %and1, %and2 ret <2 x i64> %or } -; CHECK-LABEL: test10 -; CHECK-NOT: xorps -; CHECK: movsd -; CHECK-NEXT: ret define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test11: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; CHECK-NEXT: retq %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0> %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1> %or = or <4 x i32> %and1, %and2 ret <4 x i32> %or } -; CHECK-LABEL: test11 -; CHECK-NOT: xorps -; CHECK: movss -; CHECK-NOT: orps -; CHECK: ret define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test12: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; CHECK-NEXT: retq %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1> %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0> %or = or <4 x i32> %and1, %and2 ret <4 x i32> %or } -; CHECK-LABEL: test12 -; CHECK-NOT: xorps -; CHECK: movss -; CHECK-NEXT: ret ; Verify that the following test cases are folded into single shuffles. 
define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test13:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
+; CHECK-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
-; CHECK-LABEL: test13
-; CHECK-NOT: xorps
-; CHECK: shufps
-; CHECK-NEXT: ret

define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test14:
+; CHECK: # BB#0:
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: retq
  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
  %or = or <2 x i64> %shuf1, %shuf2
  ret <2 x i64> %or
}
-; CHECK-LABEL: test14
-; CHECK-NOT: pslldq
-; CHECK-NOT: por
-; CHECK: punpcklqdq
-; CHECK-NEXT: ret

define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test15:
+; CHECK: # BB#0:
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[2,1]
+; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
-; CHECK-LABEL: test15
-; CHECK-NOT: xorps
-; CHECK: shufps
-; CHECK-NOT: shufps
-; CHECK-NOT: orps
-; CHECK: ret

define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test16:
+; CHECK: # BB#0:
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
  %or = or <2 x i64> %shuf1, %shuf2
  ret <2 x i64> %or
}
-; CHECK-LABEL: test16
-; CHECK-NOT: pslldq
-; CHECK-NOT: por
-; CHECK: punpcklqdq
-; CHECK: ret

; Verify that the DAGCombiner does not fold an OR of two shuffles into a single
; shuffle instruction when the shuffle indexes are not compatible.

define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test17:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm2, %xmm2
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,0]
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2]
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; CHECK-NEXT: orps %xmm1, %xmm2
+; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
-; CHECK-LABEL: test17
-; CHECK: por
-; CHECK-NEXT: ret

define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test18:
+; CHECK: # BB#0:
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
-; CHECK-LABEL: test18
-; CHECK: orps
-; CHECK: ret

define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test19:
+; CHECK: # BB#0:
+; CHECK-NEXT: xorps %xmm2, %xmm2
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,3]
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; CHECK-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,2]
+; CHECK-NEXT: orps %xmm1, %xmm2
+; CHECK-NEXT: movaps %xmm2, %xmm0
+; CHECK-NEXT: retq
  %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
  %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
  %or = or <4 x i32> %shuf1, %shuf2
  ret <4 x i32> %or
}
-; CHECK-LABEL: test19
-; CHECK: por
-; CHECK-NEXT: ret

define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test20:
+; CHECK: # BB#0:
+; CHECK-NEXT: orps %xmm1, %xmm0
+; CHECK-NEXT: movq %xmm0, %xmm0
+; CHECK-NEXT: retq
  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
  %or = or <2 x i64> %shuf1, %shuf2
  ret <2 x i64> %or
}
-; CHECK-LABEL: test20
-; CHECK-NOT: xorps
-; CHECK: orps
-; CHECK-NEXT: movq
-; CHECK-NEXT: ret

define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test21:
+; CHECK: # BB#0:
+; CHECK-NEXT: por %xmm1, %xmm0
+; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
  %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
  %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
  %or = or <2 x i64> %shuf1, %shuf2
  ret <2 x i64> %or
}
-; CHECK-LABEL: test21
-; CHECK: por
-; CHECK-NEXT: pslldq
-; CHECK-NEXT: ret

; Verify that the DAGCombiner doesn't crash when attempting to check if a shuffle
; with an illegal type has a legal mask. The method 'isShuffleMaskLegal' only knows
; how to handle legal vector value types.

define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) { +; CHECK-LABEL: test_crash: +; CHECK: # BB#0: +; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; CHECK-NEXT: retq %shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3> %shuf2 = shufflevector <4 x i8> %b, <4 x i8> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4> %or = or <4 x i8> %shuf1, %shuf2 ret <4 x i8> %or } -; CHECK-LABEL: test_crash -; CHECK: movsd -; CHECK: ret diff --git a/test/CodeGen/X86/combine-vec-shuffle-2.ll b/test/CodeGen/X86/combine-vec-shuffle-2.ll deleted file mode 100644 index 877d38260d61..000000000000 --- a/test/CodeGen/X86/combine-vec-shuffle-2.ll +++ /dev/null @@ -1,253 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s - -; Check that DAGCombiner correctly folds the following pairs of shuffles -; using the following rules: -; 1. shuffle(shuffle(x, y), undef) -> x -; 2. shuffle(shuffle(x, y), undef) -> y -; 3. shuffle(shuffle(x, y), undef) -> shuffle(x, undef) -; 4. shuffle(shuffle(x, y), undef) -> shuffle(undef, y) -; -; Rules 3. and 4. are used only if the resulting shuffle mask is legal. - -define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test1 -; Mask: [3,0,0,1] -; CHECK: pshufd $67 -; CHECK-NEXT: ret - - -define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test2 -; Mask: [2,0,0,3] -; CHECK: pshufd $-62 -; CHECK-NEXT: ret - - -define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test3 -; Mask: [2,0,0,3] -; CHECK: pshufd $-62 -; CHECK-NEXT: ret - - -define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test4 -; Mask: [0,0,0,1] -; CHECK: pshufd $64 -; CHECK-NEXT: ret - - -define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test5 -; Mask: [1,1] -; CHECK: movhlps -; CHECK-NEXT: ret - - -define <4 x i32> @test6(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4> - ret <4 x i32> %2 -} -; CHECK-LABEL: test6 -; Mask: [2,0,0,0] -; CHECK: pshufd $2 -; CHECK-NEXT: ret - - -define <4 x i32> @test7(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> - ret <4 x i32> %2 -} -; CHECK-LABEL: test7 -; Mask: [0,2,0,2] -; CHECK: pshufd $-120 
-; CHECK-NEXT: ret - - -define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4> - ret <4 x i32> %2 -} -; CHECK-LABEL: test8 -; Mask: [1,0,3,0] -; CHECK: pshufd $49 -; CHECK-NEXT: ret - - -define <4 x i32> @test9(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2> - ret <4 x i32> %2 -} -; CHECK-LABEL: test9 -; Mask: [1,3,0,2] -; CHECK: pshufd $-115 -; CHECK-NEXT: ret - - -define <4 x i32> @test10(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4> - ret <4 x i32> %2 -} -; CHECK-LABEL: test10 -; Mask: [1,0,1,0] -; CHECK: pshufd $17 -; CHECK-NEXT: ret - - -define <4 x i32> @test11(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0> - ret <4 x i32> %2 -} -; CHECK-LABEL: test11 -; Mask: [1,0,2,1] -; CHECK: pshufd $97 -; CHECK-NEXT: ret - - -define <4 x i32> @test12(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4> - ret <4 x i32> %2 -} -; CHECK-LABEL: test12 -; Mask: [0,0,0,0] -; CHECK: pshufd $0 -; CHECK-NEXT: ret - - -; The following pair of shuffles is folded into vector %A. -define <4 x i32> @test13(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4> - ret <4 x i32> %2 -} -; CHECK-LABEL: test13 -; CHECK-NOT: pshufd -; CHECK: ret - - -; The following pair of shuffles is folded into vector %B. -define <4 x i32> @test14(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4> - ret <4 x i32> %2 -} -; CHECK-LABEL: test14 -; CHECK-NOT: pshufd -; CHECK: ret - - -; Verify that we don't optimize the following cases. We expect more than one shuffle. 
- -define <4 x i32> @test15(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test15 -; CHECK: shufps $114 -; CHECK-NEXT: pshufd $-58 -; CHECK-NEXT: ret - - -define <4 x i32> @test16(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test16 -; CHECK: blendps $10 -; CHECK-NEXT: pshufd $-58 -; CHECK-NEXT: ret - - -define <4 x i32> @test17(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test17 -; CHECK: shufps $120 -; CHECK-NEXT: pshufd $-58 -; CHECK-NEXT: ret - - -define <4 x i32> @test18(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test18 -; CHECK: blendps $11 -; CHECK-NEXT: pshufd $-59 -; CHECK-NEXT: ret - -define <4 x i32> @test19(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0> - ret <4 x i32> %2 -} -; CHECK-LABEL: test19 -; CHECK: shufps $-104 -; CHECK-NEXT: pshufd $2 -; CHECK-NEXT: ret - - -define <4 x i32> @test20(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test20 -; CHECK: shufps $11 -; CHECK-NEXT: pshufd $-58 -; CHECK-NEXT: ret - - -define <4 x i32> @test21(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test21 -; CHECK: shufps $120 -; CHECK-NEXT: pshufd $-60 -; CHECK-NEXT: ret - - -define <4 x i32> @test22(<4 x i32> %A, <4 x i32> %B) { - %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test22 -; CHECK: blendps $11 -; CHECK-NEXT: pshufd $-43 -; CHECK-NEXT: ret - diff --git a/test/CodeGen/X86/combine-vec-shuffle-3.ll b/test/CodeGen/X86/combine-vec-shuffle-3.ll deleted file mode 100644 index bd2d34ca189a..000000000000 --- a/test/CodeGen/X86/combine-vec-shuffle-3.ll +++ /dev/null @@ -1,380 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s - -define <4 x float> @test1(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test1 -; Mask: [0,1,2,3] -; CHECK: movaps -; CHECK: ret - -define <4 x float> @test2(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector 
<4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test2 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x float> @test3(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> - ret <4 x float> %2 -} -; CHECK-LABEL: test3 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x float> @test4(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x float> %2 -} -; CHECK-LABEL: test4 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test5 -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - - -define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x i32> %2 -} -; CHECK-LABEL: test6 -; Mask: [4,5,6,7] -; CHECK: movaps -; CHECK: ret - -define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test7 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x i32> @test8(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> - ret <4 x i32> %2 -} -; CHECK-LABEL: test8 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x i32> %2 -} -; CHECK-LABEL: test9 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x i32> @test10(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> - ret <4 x i32> %2 -} -; CHECK-LABEL: test10 -; Mask: [4,1,6,7] -; CHECK: blendps -; CHECK: ret - -define <4 x float> @test11(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test11 -; Mask: [0,1,2,3] -; CHECK-NOT: movaps -; CHECK-NOT: blendps -; CHECK: ret - -define <4 x float> @test12(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 
2, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test12 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x float> @test13(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test13 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x float> @test14(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x float> %2 -} -; CHECK-LABEL: test14 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK: ret - -define <4 x float> @test15(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test15 -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - -define <4 x i32> @test16(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test16 -; Mask: [0,1,2,3] -; CHECK-NOT: movaps -; CHECK-NOT: blendps -; CHECK: ret - -define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test17 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test18 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x i32> %2 -} -; CHECK-LABEL: test19 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK: ret - -define <4 x i32> @test20(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> - %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test20 -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - -; Check some negative cases. 
-define <4 x float> @test1b(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0> - ret <4 x float> %2 -} -; CHECK-LABEL: test1b -; CHECK: shufps -; CHECK: shufps -; CHECK: ret - -define <4 x float> @test2b(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5> - ret <4 x float> %2 -} -; CHECK-LABEL: test2b -; CHECK: shufps -; CHECK: pshufd -; CHECK: ret - -define <4 x float> @test3b(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test3b -; CHECK: shufps -; CHECK: shufps -; CHECK: ret - -define <4 x float> @test4b(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test4b -; CHECK: shufps -; CHECK: shufps -; CHECK: ret - - -; Verify that we correctly fold shuffles even when we use illegal vector types. -define <4 x i8> @test1c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3> - ret <4 x i8> %2 -} -; CHECK-LABEL: test1c -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK-NEXT: ret - -define <4 x i8> @test2c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5> - %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1> - ret <4 x i8> %2 -} -; CHECK-LABEL: test2c -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK-NEXT: ret - -define <4 x i8> @test3c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5> - %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x i8> %2 -} -; CHECK-LABEL: test3c -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x i8> @test4c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7> - ret <4 x i8> %2 -} -; CHECK-LABEL: test4c -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - -; The following test cases are generated from this C++ code -; -;__m128 blend_01(__m128 a, __m128 b) -;{ -; __m128 s = a; -; s = _mm_blend_ps( s, b, 1<<0 ); -; s = _mm_blend_ps( s, b, 1<<1 ); -; return s; -;} -; -;__m128 blend_02(__m128 a, __m128 b) -;{ -; __m128 s = a; -; s = _mm_blend_ps( s, b, 1<<0 ); -; s = _mm_blend_ps( s, b, 1<<2 ); -; return s; -;} -; -;__m128 blend_123(__m128 a, __m128 b) -;{ -; __m128 s = a; -; s = _mm_blend_ps( s, b, 1<<1 ); -; s = _mm_blend_ps( s, b, 1<<2 ); -; s = _mm_blend_ps( s, b, 1<<3 ); -; return s; -;} - -; Ideally, we should collapse the 
following shuffles into a single one. - -define <4 x float> @blend_01(<4 x float> %a, <4 x float> %b) { - %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3> - %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> - ret <4 x float> %shuffle6 -} -; CHECK-LABEL: blend_01 -; CHECK: movsd -; CHECK-NEXT: ret - -define <4 x float> @blend_02(<4 x float> %a, <4 x float> %b) { - %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3> - %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> - ret <4 x float> %shuffle6 -} -; CHECK-LABEL: blend_02 -; CHECK: blendps $5 -; CHECK-NEXT: ret - -define <4 x float> @blend_123(<4 x float> %a, <4 x float> %b) { - %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> - %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> - %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> - ret <4 x float> %shuffle12 -} -; CHECK-LABEL: blend_123 -; CHECK: movss -; CHECK: ret - -define <4 x i32> @test_movhl_1(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test_movhl_1 -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x i32> @test_movhl_2(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2> - ret <4 x i32> %2 -} -; CHECK-LABEL: test_movhl_2 -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x i32> @test_movhl_3(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2> - %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2> - ret <4 x i32> %2 -} -; CHECK-LABEL: test_movhl_3 -; CHECK: movhlps -; CHECK-NEXT: ret - diff --git a/test/CodeGen/X86/combine-vec-shuffle-4.ll b/test/CodeGen/X86/combine-vec-shuffle-4.ll deleted file mode 100644 index 0ddec2c12fb5..000000000000 --- a/test/CodeGen/X86/combine-vec-shuffle-4.ll +++ /dev/null @@ -1,237 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s - -; Verify that we fold shuffles according to rule: -; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2) - -define <4 x float> @test1(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2> - ret <4 x float> %2 -} -; CHECK-LABEL: test1 -; Mask: [4,5,2,3] -; CHECK: movsd -; CHECK: ret - -define <4 x float> @test2(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5> - ret <4 x float> %2 -} -; CHECK-LABEL: test2 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x float> @test3(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> - %2 = shufflevector <4 x float> %1, 
<4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> - ret <4 x float> %2 -} -; CHECK-LABEL: test3 -; Mask: [0,1,4,u] -; CHECK: movlhps -; CHECK: ret - -define <4 x float> @test4(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x float> %2 -} -; CHECK-LABEL: test4 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test5 -; Mask: [0,1,6,7] -; CHECK: blendps $12 -; CHECK: ret - -; Verify that we fold shuffles according to rule: -; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) - -define <4 x float> @test6(<4 x float> %a) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2> - ret <4 x float> %2 -} -; CHECK-LABEL: test6 -; Mask: [0,1,2,3] -; CHECK-NOT: pshufd -; CHECK-NOT: shufps -; CHECK-NOT: movlhps -; CHECK: ret - -define <4 x float> @test7(<4 x float> %a) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5> - ret <4 x float> %2 -} -; CHECK-LABEL: test7 -; Mask: [0,1,0,1] -; CHECK-NOT: pshufd -; CHECK-NOT: shufps -; CHECK: movlhps -; CHECK-NEXT: ret - -define <4 x float> @test8(<4 x float> %a) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1> - ret <4 x float> %2 -} -; CHECK-LABEL: test8 -; Mask: [0,1,0,u] -; CHECK-NOT: pshufd -; CHECK-NOT: shufps -; CHECK: movlhps -; CHECK-NEXT: ret - -define <4 x float> @test9(<4 x float> %a) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> - ret <4 x float> %2 -} -; CHECK-LABEL: test9 -; Mask: [2,3,2,3] -; CHECK-NOT: movlhps -; CHECK-NOT: palignr -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x float> @test10(<4 x float> %a) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test10 -; Mask: [0,1,2,3] -; CHECK-NOT: pshufd -; CHECK-NOT: shufps -; CHECK-NOT: movlhps -; CHECK: ret - -define <4 x float> @test11(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> - %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6> - ret <4 x float> %2 -} -; CHECK-LABEL: test11 -; Mask: [4,5,2,3] -; CHECK: movsd -; CHECK: ret - -define <4 x float> @test12(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> - %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> - ret <4 x float> %2 -} -; CHECK-LABEL: test12 -; Mask: [0,1,4,5] -; CHECK: movlhps -; 
CHECK: ret - -define <4 x float> @test13(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> - %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5> - ret <4 x float> %2 -} -; CHECK-LABEL: test13 -; Mask: [0,1,4,u] -; CHECK: movlhps -; CHECK: ret - -define <4 x float> @test14(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> - %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> - ret <4 x float> %2 -} -; CHECK-LABEL: test14 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x float> @test15(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> - %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test15 -; Mask: [0,1,6,7] -; CHECK: blendps $12 -; CHECK: ret - -; Verify that shuffles are canonicalized according to rules: -; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) -; -; This allows to trigger the following combine rule: -; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) -; -; As a result, all the shuffle pairs in each function below should be -; combined into a single legal shuffle operation. - -define <4 x float> @test16(<4 x float> %a) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> - %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test16 -; Mask: [0,1,2,3] -; CHECK-NOT: pshufd -; CHECK-NOT: shufps -; CHECK-NOT: movlhps -; CHECK: ret - -define <4 x float> @test17(<4 x float> %a) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> - %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> - ret <4 x float> %2 -} -; CHECK-LABEL: test17 -; Mask: [0,1,0,1] -; CHECK-NOT: pshufd -; CHECK-NOT: shufps -; CHECK: movlhps -; CHECK-NEXT: ret - -define <4 x float> @test18(<4 x float> %a) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> - %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5> - ret <4 x float> %2 -} -; CHECK-LABEL: test18 -; Mask: [0,1,0,u] -; CHECK-NOT: pshufd -; CHECK-NOT: shufps -; CHECK: movlhps -; CHECK-NEXT: ret - -define <4 x float> @test19(<4 x float> %a) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> - %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> - ret <4 x float> %2 -} -; CHECK-LABEL: test19 -; Mask: [2,3,2,3] -; CHECK-NOT: movlhps -; CHECK-NOT: palignr -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x float> @test20(<4 x float> %a) { - %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> - %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test20 -; Mask: [0,1,2,3] -; CHECK-NOT: pshufd -; CHECK-NOT: shufps -; CHECK-NOT: movlhps -; CHECK: ret - diff --git a/test/CodeGen/X86/combine-vec-shuffle-5.ll b/test/CodeGen/X86/combine-vec-shuffle-5.ll deleted file mode 100644 index 16c45efe4be6..000000000000 --- a/test/CodeGen/X86/combine-vec-shuffle-5.ll 
+++ /dev/null @@ -1,257 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s - -; Verify that the DAGCombiner correctly folds all the shufflevector pairs -; into a single shuffle operation. - -define <4 x float> @test1(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test1 -; Mask: [0,1,2,3] -; CHECK: movaps -; CHECK: ret - -define <4 x float> @test2(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 2, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test2 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x float> @test3(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> - %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5> - ret <4 x float> %2 -} -; CHECK-LABEL: test3 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x float> @test4(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> - %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> - ret <4 x float> %2 -} -; CHECK-LABEL: test4 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x float> @test5(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 6, i32 3> - ret <4 x float> %2 -} -; CHECK-LABEL: test5 -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - - -define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test6 -; Mask: [4,5,6,7] -; CHECK: movaps -; CHECK: ret - -define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <4 x i32> <i32 4, i32 5, i32 2, i32 7> - ret <4 x i32> %2 -} -; CHECK-LABEL: test7 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x i32> @test8(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> - %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5> - ret <4 x i32> %2 -} -; CHECK-LABEL: test8 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> - %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> - ret <4 x i32> %2 -} -; CHECK-LABEL: test9 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK-NEXT: ret - -define <4 x i32> @test10(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <4 x i32> <i32 4, i32 5, i32 6, i32 3> - ret <4 x i32> %2 -} -; CHECK-LABEL: test10 -; Mask: 
[4,1,6,7] -; CHECK: blendps -; CHECK: ret - -define <4 x float> @test11(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test11 -; Mask: [0,1,2,3] -; CHECK-NOT: movaps -; CHECK-NOT: blendps -; CHECK: ret - -define <4 x float> @test12(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test12 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x float> @test13(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test13 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x float> @test14(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> - %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> - ret <4 x float> %2 -} -; CHECK-LABEL: test14 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK: ret - -define <4 x float> @test15(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> - %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 1, i32 6, i32 7> - ret <4 x float> %2 -} -; CHECK-LABEL: test15 -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - -define <4 x i32> @test16(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i32> %a, <4 x i32> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x i32> %2 -} -; CHECK-LABEL: test16 -; Mask: [0,1,2,3] -; CHECK-NOT: movaps -; CHECK-NOT: blendps -; CHECK: ret - -define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - %2 = shufflevector <4 x i32> %a, <4 x i32> %1, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x i32> %2 -} -; CHECK-LABEL: test17 -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK: ret - -define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - %2 = shufflevector <4 x i32> %a, <4 x i32> %1, <4 x i32> <i32 0, i32 1, i32 6, i32 7> - ret <4 x i32> %2 -} -; CHECK-LABEL: test18 -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK: ret - -define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> - %2 = shufflevector <4 x i32> %a, <4 x i32> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> - ret <4 x i32> %2 -} -; CHECK-LABEL: test19 -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK: ret - -define <4 x i32> @test20(<4 x i32> %a, <4 x i32> %b) { - %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> - %2 = shufflevector <4 x i32> %a, <4 x i32> %1, <4 x i32> <i32 4, i32 1, i32 6, i32 7> - ret <4 x i32> %2 -} -; CHECK-LABEL: test20 -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - -; Verify that we correctly fold shuffles even when we use illegal vector 
types. -define <4 x i8> @test1c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - %2 = shufflevector <4 x i8> %B, <4 x i8> %1, <4 x i32> <i32 4, i32 5, i32 2, i32 7> - ret <4 x i8> %2 -} -; CHECK-LABEL: test1c -; Mask: [0,5,6,7] -; CHECK: movss -; CHECK-NEXT: ret - -define <4 x i8> @test2c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5> - %2 = shufflevector <4 x i8> %B, <4 x i8> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5> - ret <4 x i8> %2 -} -; CHECK-LABEL: test2c -; Mask: [0,1,4,5] -; CHECK: movlhps -; CHECK-NEXT: ret - -define <4 x i8> @test3c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5> - %2 = shufflevector <4 x i8> %B, <4 x i8> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> - ret <4 x i8> %2 -} -; CHECK-LABEL: test3c -; Mask: [6,7,2,3] -; CHECK: movhlps -; CHECK: ret - -define <4 x i8> @test4c(<4 x i8>* %a, <4 x i8>* %b) { - %A = load <4 x i8>* %a - %B = load <4 x i8>* %b - %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> - %2 = shufflevector <4 x i8> %B, <4 x i8> %1, <4 x i32> <i32 4, i32 5, i32 6, i32 3> - ret <4 x i8> %2 -} -; CHECK-LABEL: test4c -; Mask: [4,1,6,7] -; CHECK: blendps $13 -; CHECK: ret - diff --git a/test/CodeGen/X86/combine-vec-shuffle.ll b/test/CodeGen/X86/combine-vec-shuffle.ll deleted file mode 100644 index 9e6ab892713b..000000000000 --- a/test/CodeGen/X86/combine-vec-shuffle.ll +++ /dev/null @@ -1,253 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s - -; Verify that the DAGCombiner correctly folds according to the following rules: - -; fold (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C) -; fold (OR (shuf (A, C), shuf (B, C)) -> shuf (OR (A, B), C) -; fold (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0) - -; fold (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B)) -; fold (OR (shuf (C, A), shuf (C, B)) -> shuf (C, OR (A, B)) -; fold (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B)) - - - -define <4 x i32> @test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> - %and = and <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %and -} -; CHECK-LABEL: test1 -; CHECK-NOT: pshufd -; CHECK: pand -; CHECK-NEXT: pshufd -; CHECK-NEXT: ret - - -define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> - %or = or <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %or -} -; CHECK-LABEL: test2 -; CHECK-NOT: pshufd -; CHECK: por -; CHECK-NEXT: pshufd -; CHECK-NEXT: ret - - -define <4 x i32> @test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> - %xor = xor <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %xor -} -; CHECK-LABEL: test3 -; CHECK-NOT: pshufd -; CHECK: pxor -; CHECK-NEXT: pshufd -; CHECK-NEXT: ret - - -define 
<4 x i32> @test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> - %and = and <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %and -} -; CHECK-LABEL: test4 -; CHECK-NOT: pshufd -; CHECK: pand -; CHECK-NEXT: pshufd -; CHECK-NEXT: ret - - -define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> - %or = or <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %or -} -; CHECK-LABEL: test5 -; CHECK-NOT: pshufd -; CHECK: por -; CHECK-NEXT: pshufd -; CHECK-NEXT: ret - - -define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> - %xor = xor <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %xor -} -; CHECK-LABEL: test6 -; CHECK-NOT: pshufd -; CHECK: pxor -; CHECK-NEXT: pshufd -; CHECK-NEXT: ret - - -; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles -; are not performing a swizzle operations. - -define <4 x i32> @test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %and = and <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %and -} -; CHECK-LABEL: test1b -; CHECK-NOT: blendps -; CHECK: andps -; CHECK-NEXT: blendps -; CHECK-NEXT: ret - - -define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %or = or <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %or -} -; CHECK-LABEL: test2b -; CHECK-NOT: blendps -; CHECK: orps -; CHECK-NEXT: blendps -; CHECK-NEXT: ret - - -define <4 x i32> @test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %xor = xor <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %xor -} -; CHECK-LABEL: test3b -; CHECK-NOT: blendps -; CHECK: xorps -; CHECK-NEXT: xorps -; CHECK-NEXT: blendps -; CHECK-NEXT: ret - - -define <4 x i32> @test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %and = and <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %and -} -; CHECK-LABEL: test4b -; CHECK-NOT: blendps -; CHECK: andps -; CHECK-NEXT: blendps -; CHECK: ret - - -define <4 x i32> @test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %or = or <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %or -} -; CHECK-LABEL: test5b -; CHECK-NOT: blendps -; CHECK: orps -; CHECK-NEXT: blendps -; CHECK: ret - - -define <4 x i32> @test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = 
shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> - %xor = xor <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %xor -} -; CHECK-LABEL: test6b -; CHECK-NOT: blendps -; CHECK: xorps -; CHECK-NEXT: xorps -; CHECK-NEXT: blendps -; CHECK: ret - -define <4 x i32> @test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %and = and <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %and -} -; CHECK-LABEL: test1c -; CHECK-NOT: shufps -; CHECK: andps -; CHECK-NEXT: shufps -; CHECK-NEXT: ret - - -define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %or = or <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %or -} -; CHECK-LABEL: test2c -; CHECK-NOT: shufps -; CHECK: orps -; CHECK-NEXT: shufps -; CHECK-NEXT: ret - - -define <4 x i32> @test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %xor = xor <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %xor -} -; CHECK-LABEL: test3c -; CHECK-NOT: shufps -; CHECK: xorps -; CHECK-NEXT: xorps -; CHECK-NEXT: shufps -; CHECK-NEXT: ret - - -define <4 x i32> @test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %and = and <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %and -} -; CHECK-LABEL: test4c -; CHECK-NOT: shufps -; CHECK: andps -; CHECK-NEXT: shufps -; CHECK: ret - - -define <4 x i32> @test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %or = or <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %or -} -; CHECK-LABEL: test5c -; CHECK-NOT: shufps -; CHECK: orps -; CHECK-NEXT: shufps -; CHECK: ret - - -define <4 x i32> @test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { - %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> - %xor = xor <4 x i32> %shuf1, %shuf2 - ret <4 x i32> %xor -} -; CHECK-LABEL: test6c -; CHECK-NOT: shufps -; CHECK: xorps -; CHECK-NEXT: xorps -; CHECK-NEXT: shufps -; CHECK: ret - diff --git a/test/CodeGen/X86/commute-blend-avx2.ll b/test/CodeGen/X86/commute-blend-avx2.ll new file mode 100644 index 000000000000..d06c6dad8dbf --- /dev/null +++ b/test/CodeGen/X86/commute-blend-avx2.ll @@ -0,0 +1,89 @@ +; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=core-avx2 -mattr=avx2 < %s | FileCheck %s + +define <8 x i16> @commute_fold_vpblendw_128(<8 x i16> %a, <8 x i16>* %b) #0 { + %1 = load <8 x i16>* %b + %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17) + ret <8 x i16> %2 + + ;LABEL: commute_fold_vpblendw_128 + ;CHECK: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] + ;CHECK-NEXT: retq +} +declare <8 x 
i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone + +define <16 x i16> @commute_fold_vpblendw_256(<16 x i16> %a, <16 x i16>* %b) #0 { + %1 = load <16 x i16>* %b + %2 = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %1, <16 x i16> %a, i8 17) + ret <16 x i16> %2 + + ;CHECK-LABEL: commute_fold_vpblendw_256 + ;CHECK: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15] + ;CHECK-NEXT: retq +} +declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind readnone + +define <4 x i32> @commute_fold_vpblendd_128(<4 x i32> %a, <4 x i32>* %b) #0 { + %1 = load <4 x i32>* %b + %2 = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %1, <4 x i32> %a, i8 1) + ret <4 x i32> %2 + + ;CHECK-LABEL: commute_fold_vpblendd_128 + ;CHECK: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] + ;CHECK-NEXT: retq +} +declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind readnone + +define <8 x i32> @commute_fold_vpblendd_256(<8 x i32> %a, <8 x i32>* %b) #0 { + %1 = load <8 x i32>* %b + %2 = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %1, <8 x i32> %a, i8 129) + ret <8 x i32> %2 + + ;CHECK-LABEL: commute_fold_vpblendd_256 + ;CHECK: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7] + ;CHECK-NEXT: retq +} +declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind readnone + +define <4 x float> @commute_fold_vblendps_128(<4 x float> %a, <4 x float>* %b) #0 { + %1 = load <4 x float>* %b + %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3) + ret <4 x float> %2 + + ;CHECK-LABEL: commute_fold_vblendps_128 + ;CHECK: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] + ;CHECK-NEXT: retq +} +declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone + +define <8 x float> @commute_fold_vblendps_256(<8 x float> %a, <8 x float>* %b) #0 { + %1 = load <8 x float>* %b + %2 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %1, <8 x float> %a, i8 7) + ret <8 x float> %2 + + ;CHECK-LABEL: commute_fold_vblendps_256 + ;CHECK: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7] + ;CHECK-NEXT: retq +} +declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone + +define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 { + %1 = load <2 x double>* %b + %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1) + ret <2 x double> %2 + + ;CHECK-LABEL: commute_fold_vblendpd_128 + ;CHECK: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] + ;CHECK-NEXT: retq +} +declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone + +define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 { + %1 = load <4 x double>* %b + %2 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %1, <4 x double> %a, i8 7) + ret <4 x double> %2 + + ;CHECK-LABEL: commute_fold_vblendpd_256 + ;CHECK: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3] + ;CHECK-NEXT: retq +} +declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone diff --git a/test/CodeGen/X86/commute-blend-sse41.ll b/test/CodeGen/X86/commute-blend-sse41.ll new file mode 100644 index 000000000000..59fef8c3a29f --- /dev/null +++ b/test/CodeGen/X86/commute-blend-sse41.ll @@ -0,0 +1,34 @@ +; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=corei7 < %s | FileCheck %s + +define <8 x i16> @commute_fold_pblendw(<8 x i16> %a, <8 x i16>* %b) #0 { + %1 = load <8 x i16>* 
%b + %2 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %1, <8 x i16> %a, i8 17) + ret <8 x i16> %2 + + ;CHECK-LABEL: commute_fold_pblendw + ;CHECK: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7] + ;CHECK-NEXT: retq +} +declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone + +define <4 x float> @commute_fold_blendps(<4 x float> %a, <4 x float>* %b) #0 { + %1 = load <4 x float>* %b + %2 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %1, <4 x float> %a, i8 3) + ret <4 x float> %2 + + ;CHECK-LABEL: commute_fold_blendps + ;CHECK: blendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] + ;CHECK-NEXT: retq +} +declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone + +define <2 x double> @commute_fold_blendpd(<2 x double> %a, <2 x double>* %b) #0 { + %1 = load <2 x double>* %b + %2 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %1, <2 x double> %a, i8 1) + ret <2 x double> %2 + + ;CHECK-LABEL: commute_fold_blendpd + ;CHECK: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1] + ;CHECK-NEXT: retq +} +declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone diff --git a/test/CodeGen/X86/commuted-blend-mask.ll b/test/CodeGen/X86/commuted-blend-mask.ll new file mode 100644 index 000000000000..e6322cbb7a14 --- /dev/null +++ b/test/CodeGen/X86/commuted-blend-mask.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s + +; When commuting the operands of an SSE blend, make sure that the resulting blend +; mask can be encoded as an imm8. +; Before, when commuting the operands to the shuffle in function @test, the backend +; produced the following assembly: +; pblendw $4294967103, %xmm1, %xmm0 + +define <4 x i32> @test(<4 x i32> %a, <4 x i32> %b) { + ;CHECK: pblendw $63, %xmm1, %xmm0 + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 3> + ret <4 x i32> %shuffle +} diff --git a/test/CodeGen/X86/compact-unwind.ll b/test/CodeGen/X86/compact-unwind.ll index 9d3a1257288c..d3b89a54e0b8 100644 --- a/test/CodeGen/X86/compact-unwind.ll +++ b/test/CodeGen/X86/compact-unwind.ll @@ -1,12 +1,20 @@ ; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 | FileCheck -check-prefix=ASM %s ; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 -filetype=obj -o - \ -; RUN: | llvm-objdump -triple x86_64-apple-darwin11 -s - \ +; RUN: | llvm-objdump -triple x86_64-apple-darwin11 -unwind-info - \ ; RUN: | FileCheck -check-prefix=CU %s ; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 \ ; RUN: | llvm-mc -triple x86_64-apple-darwin11 -filetype=obj -o - \ -; RUN: | llvm-objdump -triple x86_64-apple-darwin11 -s - \ +; RUN: | llvm-objdump -triple x86_64-apple-darwin11 -unwind-info - \ ; RUN: | FileCheck -check-prefix=FROM-ASM %s +; RUN: llc < %s -mtriple x86_64-apple-macosx10.8.0 -mcpu corei7 -filetype=obj -o - \ +; RUN: | llvm-objdump -triple x86_64-apple-macosx10.8.0 -unwind-info - \ +; RUN: | FileCheck -check-prefix=NOFP-CU %s +; RUN: llc < %s -mtriple x86_64-apple-darwin11 -mcpu corei7 \ +; RUN: | llvm-mc -triple x86_64-apple-darwin11 -filetype=obj -o - \ +; RUN: | llvm-objdump -triple x86_64-apple-darwin11 -unwind-info - \ +; RUN: | FileCheck -check-prefix=NOFP-FROM-ASM %s + %ty = type { i8* } @gv = external global i32 @@ -17,15 +25,19 @@ ; Even though we can't encode %rax into the compact unwind, We still want to be ; able to generate a compact unwind encoding in this particular 
case. -; CU: Contents of section __compact_unwind: -; CU-NEXT: 0020 00000000 00000000 1e000000 01000101 -; CU-NEXT: 0030 00000000 00000000 00000000 00000000 +; CU: Contents of __compact_unwind section: +; CU-NEXT: Entry at offset 0x0: +; CU-NEXT: start: 0x0 _test0 +; CU-NEXT: length: 0x1e +; CU-NEXT: compact encoding: 0x01010001 -; FROM-ASM: Contents of section __compact_unwind: -; FROM-ASM-NEXT: 0020 00000000 00000000 1e000000 01000101 -; FROM-ASM-NEXT: 0030 00000000 00000000 00000000 00000000 +; FROM-ASM: Contents of __compact_unwind section: +; FROM-ASM-NEXT: Entry at offset 0x0: +; FROM-ASM-NEXT: start: 0x0 _test0 +; FROM-ASM-NEXT: length: 0x1e +; FROM-ASM-NEXT: compact encoding: 0x01010001 -define i8* @foo(i64 %size) { +define i8* @test0(i64 %size) { %addr = alloca i64, align 8 %tmp20 = load i32* @gv, align 4 %tmp21 = call i32 @bar() @@ -39,3 +51,61 @@ define i8* @foo(i64 %size) { } declare i32 @bar() + +%"struct.dyld::MappedRanges" = type { [400 x %struct.anon], %"struct.dyld::MappedRanges"* } +%struct.anon = type { %class.ImageLoader*, i64, i64 } +%class.ImageLoader = type { i32 (...)**, i8*, i8*, i32, i64, i64, i32, i32, %"struct.ImageLoader::recursive_lock"*, i16, i16, [4 x i8] } +%"struct.ImageLoader::recursive_lock" = type { i32, i32 } + +@G1 = external hidden global %"struct.dyld::MappedRanges", align 8 + +declare void @OSMemoryBarrier() optsize + +; Test that the code below uses the UNWIND_X86_64_MODE_STACK_IMMD compact +; unwind encoding. + +; NOFP-CU: Entry at offset 0x20: +; NOFP-CU-NEXT: start: 0x1d _test1 +; NOFP-CU-NEXT: length: 0x42 +; NOFP-CU-NEXT: compact encoding: 0x02040c0a + +; NOFP-FROM-ASM: Entry at offset 0x20: +; NOFP-FROM-ASM-NEXT: start: 0x1d _test1 +; NOFP-FROM-ASM-NEXT: length: 0x42 +; NOFP-FROM-ASM-NEXT: compact encoding: 0x02040c0a + +define void @test1(%class.ImageLoader* %image) optsize ssp uwtable { +entry: + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.inc10, %entry + %p.019 = phi %"struct.dyld::MappedRanges"* [ @G1, %entry ], [ %1, %for.inc10 ] + br label %for.body3 + +for.body3: ; preds = %for.inc, %for.cond1.preheader + %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ] + %image4 = getelementptr inbounds %"struct.dyld::MappedRanges"* %p.019, i64 0, i32 0, i64 %indvars.iv, i32 0 + %0 = load %class.ImageLoader** %image4, align 8 + %cmp5 = icmp eq %class.ImageLoader* %0, %image + br i1 %cmp5, label %if.then, label %for.inc + +if.then: ; preds = %for.body3 + tail call void @OSMemoryBarrier() optsize + store %class.ImageLoader* null, %class.ImageLoader** %image4, align 8 + br label %for.inc + +for.inc: ; preds = %if.then, %for.body3 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 400 + br i1 %exitcond, label %for.inc10, label %for.body3 + +for.inc10: ; preds = %for.inc + %next = getelementptr inbounds %"struct.dyld::MappedRanges"* %p.019, i64 0, i32 1 + %1 = load %"struct.dyld::MappedRanges"** %next, align 8 + %cmp = icmp eq %"struct.dyld::MappedRanges"* %1, null + br i1 %cmp, label %for.end11, label %for.cond1.preheader + +for.end11: ; preds = %for.inc10 + ret void +} diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll index b57889643e0d..7160dcc614c0 100644 --- a/test/CodeGen/X86/constructor.ll +++ b/test/CodeGen/X86/constructor.ll @@ -1,6 +1,8 @@ -; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=CTOR %s -; RUN: llc -mtriple x86_64-pc-linux -use-init-array < %s | FileCheck 
--check-prefix=INIT-ARRAY %s -@llvm.global_ctors = appending global [2 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @f }, { i32, void ()* } { i32 15, void ()* @g }] +; RUN: llc -mtriple x86_64-pc-linux -use-ctors < %s | FileCheck --check-prefix=CTOR %s +; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=INIT-ARRAY %s +@llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null}, { i32, void ()*, i8* } { i32 15, void ()* @g, i8* @v }] + +@v = weak_odr global i8 0 define void @f() { entry: @@ -12,14 +14,14 @@ entry: ret void } -; CTOR: .section .ctors.65520,"aw",@progbits +; CTOR: .section .ctors.65520,"aGw",@progbits,v,comdat ; CTOR-NEXT: .align 8 ; CTOR-NEXT: .quad g ; CTOR-NEXT: .section .ctors,"aw",@progbits ; CTOR-NEXT: .align 8 ; CTOR-NEXT: .quad f -; INIT-ARRAY: .section .init_array.15,"aw",@init_array +; INIT-ARRAY: .section .init_array.15,"aGw",@init_array,v,comdat ; INIT-ARRAY-NEXT: .align 8 ; INIT-ARRAY-NEXT: .quad g ; INIT-ARRAY-NEXT: .section .init_array,"aw",@init_array diff --git a/test/CodeGen/X86/copysign-constant-magnitude.ll b/test/CodeGen/X86/copysign-constant-magnitude.ll new file mode 100644 index 000000000000..537d6298ddf4 --- /dev/null +++ b/test/CodeGen/X86/copysign-constant-magnitude.ll @@ -0,0 +1,105 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +define void @test_copysign_const_magnitude_d(double %X) { +; CHECK: [[SIGNMASK:L.+]]: +; CHECK-NEXT: .quad -9223372036854775808 ## double -0.000000e+00 +; CHECK-NEXT: .quad 0 ## double 0.000000e+00 +; CHECK: [[ZERO:L.+]]: +; CHECK-NEXT: .space 16 +; CHECK: [[ONE:L.+]]: +; CHECK-NEXT: .quad 4607182418800017408 ## double 1.000000e+00 +; CHECK-NEXT: .quad 0 ## double 0.000000e+00 +; CHECK-LABEL: test_copysign_const_magnitude_d: + +; CHECK: id + %iX = call double @id_d(double %X) + +; CHECK-NEXT: andpd [[SIGNMASK]](%rip), %xmm0 + %d0 = call double @copysign(double 0.000000e+00, double %iX) + +; CHECK-NEXT: id + %id0 = call double @id_d(double %d0) + +; CHECK-NEXT: andpd [[SIGNMASK]](%rip), %xmm0 +; CHECK-NEXT: orpd [[ZERO]](%rip), %xmm0 + %dn0 = call double @copysign(double -0.000000e+00, double %id0) + +; CHECK-NEXT: id + %idn0 = call double @id_d(double %dn0) + +; CHECK-NEXT: andpd [[SIGNMASK]](%rip), %xmm0 +; CHECK-NEXT: orpd [[ONE]](%rip), %xmm0 + %d1 = call double @copysign(double 1.000000e+00, double %idn0) + +; CHECK-NEXT: id + %id1 = call double @id_d(double %d1) + +; CHECK-NEXT: andpd [[SIGNMASK]](%rip), %xmm0 +; CHECK-NEXT: orpd [[ONE]](%rip), %xmm0 + %dn1 = call double @copysign(double -1.000000e+00, double %id1) + +; CHECK-NEXT: id + %idn1 = call double @id_d(double %dn1) + +; CHECK: retq + ret void +} + +define void @test_copysign_const_magnitude_f(float %X) { +; CHECK: [[SIGNMASK:L.+]]: +; CHECK-NEXT: .long 2147483648 ## float -0.000000e+00 +; CHECK-NEXT: .long 0 ## float 0.000000e+00 +; CHECK-NEXT: .long 0 ## float 0.000000e+00 +; CHECK-NEXT: .long 0 ## float 0.000000e+00 +; CHECK: [[ZERO:L.+]]: +; CHECK-NEXT: .space 16 +; CHECK: [[ONE:L.+]]: +; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00 +; CHECK-NEXT: .long 0 ## float 0.000000e+00 +; CHECK-NEXT: .long 0 ## float 0.000000e+00 +; CHECK-NEXT: .long 0 ## float 0.000000e+00 +; CHECK-LABEL: test_copysign_const_magnitude_f: + +; CHECK: id + %iX = call float @id_f(float %X) + +; CHECK-NEXT: andps [[SIGNMASK]](%rip), %xmm0 + %d0 = call float @copysignf(float 
0.000000e+00, float %iX) + +; CHECK-NEXT: id + %id0 = call float @id_f(float %d0) + +; CHECK-NEXT: andps [[SIGNMASK]](%rip), %xmm0 +; CHECK-NEXT: orps [[ZERO]](%rip), %xmm0 + %dn0 = call float @copysignf(float -0.000000e+00, float %id0) + +; CHECK-NEXT: id + %idn0 = call float @id_f(float %dn0) + +; CHECK-NEXT: andps [[SIGNMASK]](%rip), %xmm0 +; CHECK-NEXT: orps [[ONE]](%rip), %xmm0 + %d1 = call float @copysignf(float 1.000000e+00, float %idn0) + +; CHECK-NEXT: id + %id1 = call float @id_f(float %d1) + +; CHECK-NEXT: andps [[SIGNMASK]](%rip), %xmm0 +; CHECK-NEXT: orps [[ONE]](%rip), %xmm0 + %dn1 = call float @copysignf(float -1.000000e+00, float %id1) + +; CHECK-NEXT: id + %idn1 = call float @id_f(float %dn1) + +; CHECK: retq + ret void +} + +declare double @copysign(double, double) nounwind readnone +declare float @copysignf(float, float) nounwind readnone + +; Dummy identity functions, so we always have xmm0, and prevent optimizations. +declare double @id_d(double) +declare float @id_f(float) diff --git a/test/CodeGen/X86/copysign-zero.ll b/test/CodeGen/X86/copysign-zero.ll deleted file mode 100644 index 47522d808058..000000000000 --- a/test/CodeGen/X86/copysign-zero.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s | not grep orpd -; RUN: llc < %s | grep andpd | count 1 - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" -target triple = "x86_64-apple-darwin8" - -define double @test(double %X) nounwind { -entry: - %tmp2 = tail call double @copysign( double 0.000000e+00, double %X ) nounwind readnone ; <double> [#uses=1] - ret double %tmp2 -} - -declare double @copysign(double, double) nounwind readnone - diff --git a/test/CodeGen/X86/cpus.ll b/test/CodeGen/X86/cpus.ll new file mode 100644 index 000000000000..ee1f7bb5295b --- /dev/null +++ b/test/CodeGen/X86/cpus.ll @@ -0,0 +1,35 @@ +; Test that the CPU names work. +; +; First ensure the error message matches what we expect. +; CHECK-ERROR: not a recognized processor for this target +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=foobar 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; +; Now ensure the error message doesn't occur for valid CPUs. 
+; CHECK-NO-ERROR-NOT: not a recognized processor for this target +; +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=nocona 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=core2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=penryn 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=nehalem 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=westmere 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=sandybridge 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=ivybridge 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=haswell 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=broadwell 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bonnell 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=silvermont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k8 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=opteron 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon64 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon-fx 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k8-sse3 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=opteron-sse3 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon64-sse3 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=amdfam10 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=barcelona 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bdver1 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bdver2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bdver3 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bdver4 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=btver1 2>&1 | FileCheck %s 
--check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=btver2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty diff --git a/test/CodeGen/X86/crash-O0.ll b/test/CodeGen/X86/crash-O0.ll index 956d43b4e895..df8eaaf442b7 100644 --- a/test/CodeGen/X86/crash-O0.ll +++ b/test/CodeGen/X86/crash-O0.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 -relocation-model=pic -disable-fp-elim < %s +; RUN: llc -O0 -relocation-model=pic -disable-fp-elim < %s | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-darwin10" @@ -29,3 +29,23 @@ entry: "41": ; preds = %"39" unreachable } + +; When using fast isel, sdiv is lowered into a sequence of CQO + DIV64. +; CQO implicitly defines AX, and DIV64 implicitly uses it. +; When an instruction got between those two, RegAllocFast reused +; AX for the vreg defined in between, and the compiler crashed. +; +; An instruction gets between CQO and DIV64 because the load is folded +; into the division, but it requires a sign extension. +; PR21700 +; CHECK-LABEL: addressModeWith32bitIndex: +; CHECK: cqto +; CHECK-NEXT: movslq +; CHECK-NEXT: idivq +; CHECK: retq +define i64 @addressModeWith32bitIndex(i32 %V) { + %gep = getelementptr i64* null, i32 %V + %load = load i64* %gep + %sdiv = sdiv i64 0, %load + ret i64 %sdiv +} diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll index ee73377dffde..6b3dd3675750 100644 --- a/test/CodeGen/X86/crash.ll +++ b/test/CodeGen/X86/crash.ll @@ -108,8 +108,8 @@ do.body92: ; preds = %if.then66 ret void } -!0 = metadata !{i32 633550} -!1 = metadata !{i32 634261} +!0 = !{i32 633550} +!1 = !{i32 634261} ; Crash during XOR optimization. diff --git a/test/CodeGen/X86/critical-anti-dep-breaker.ll b/test/CodeGen/X86/critical-anti-dep-breaker.ll new file mode 100644 index 000000000000..32d3f49c79cc --- /dev/null +++ b/test/CodeGen/X86/critical-anti-dep-breaker.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic -post-RA-scheduler=1 -break-anti-dependencies=critical | FileCheck %s + +; PR20308 ( http://llvm.org/bugs/show_bug.cgi?id=20308 ) +; The critical-anti-dependency-breaker must not use register def information from a kill inst. +; This test case expects such an instruction to appear as a comment with def info for RDI. +; There is an anti-dependency (WAR) hazard on RAX with the default reg allocation and scheduling. +; The post-RA-scheduler and critical-anti-dependency breaker can eliminate that hazard using R10. +; That is the first free register that isn't used as a param in the call to "@Image". 
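+; +; Illustrative sketch of the renaming (a hypothetical instruction sequence assumed for this note, not matched by FileCheck; only the %r10 form above is actually checked): +; before breaking: movq PartClass@GOTPCREL(%rip), %rax ; WAR hazard: this def of RAX must wait for the last read of RAX +; after breaking: movq PartClass@GOTPCREL(%rip), %r10 ; renamed def, so the anti-dependency on RAX is gone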
+ +@PartClass = external global i32 +@NullToken = external global i64 + +; CHECK-LABEL: Part_Create: +; CHECK-DAG: # kill: RDI<def> +; CHECK-DAG: movq PartClass@GOTPCREL(%rip), %r10 +define i32 @Part_Create(i64* %Anchor, i32 %TypeNum, i32 %F, i32 %Z, i32* %Status, i64* %PartTkn) { + %PartObj = alloca i64*, align 8 + %Vchunk = alloca i64, align 8 + %1 = load i64* @NullToken, align 4 + store i64 %1, i64* %Vchunk, align 8 + %2 = load i32* @PartClass, align 4 + call i32 @Image(i64* %Anchor, i32 %2, i32 0, i32 0, i32* %Status, i64* %PartTkn, i64** %PartObj) + call i32 @Create(i64* %Anchor) + ret i32 %2 +} + +declare i32 @Image(i64*, i32, i32, i32, i32*, i64*, i64**) +declare i32 @Create(i64*) diff --git a/test/CodeGen/X86/cttz-ctlz.ll b/test/CodeGen/X86/cttz-ctlz.ll new file mode 100644 index 000000000000..8717d4015954 --- /dev/null +++ b/test/CodeGen/X86/cttz-ctlz.ll @@ -0,0 +1,422 @@ +; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -mattr=+bmi < %s | FileCheck %s --check-prefix=ALL --check-prefix=BMI +; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown -mattr=+lzcnt < %s | FileCheck %s --check-prefix=ALL --check-prefix=LZCNT +; RUN: opt -S -codegenprepare -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=ALL --check-prefix=GENERIC + + +define i64 @test1(i64 %A) { +; ALL-LABEL: @test1( +; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i64 @llvm.ctlz.i64(i64 %A, i1 false) +; LZCNT-NEXT: ret i64 [[CTLZ]] +; BMI: icmp eq i64 %A, 0 +; BMI: call i64 @llvm.ctlz.i64(i64 %A, i1 true) +; GENERIC: icmp eq i64 %A, 0 +; GENERIC: call i64 @llvm.ctlz.i64(i64 %A, i1 true) +entry: + %tobool = icmp eq i64 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ] + ret i64 %cond +} + + +define i32 @test2(i32 %A) { +; ALL-LABEL: @test2( +; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i32 @llvm.ctlz.i32(i32 %A, i1 false) +; LZCNT-NEXT: ret i32 [[CTLZ]] +; BMI: icmp eq i32 %A, 0 +; BMI: call i32 @llvm.ctlz.i32(i32 %A, i1 true) +; GENERIC: icmp eq i32 %A, 0 +; GENERIC: call i32 @llvm.ctlz.i32(i32 %A, i1 true) +entry: + %tobool = icmp eq i32 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ] + ret i32 %cond +} + + +define signext i16 @test3(i16 signext %A) { +; ALL-LABEL: @test3( +; LZCNT: [[CTLZ:%[A-Za-z0-9]+]] = call i16 @llvm.ctlz.i16(i16 %A, i1 false) +; LZCNT-NEXT: ret i16 [[CTLZ]] +; BMI: icmp eq i16 %A, 0 +; BMI: call i16 @llvm.ctlz.i16(i16 %A, i1 true) +; GENERIC: icmp eq i16 %A, 0 +; GENERIC: call i16 @llvm.ctlz.i16(i16 %A, i1 true) +entry: + %tobool = icmp eq i16 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ] + ret i16 %cond +} + + +define i64 @test1b(i64 %A) { +; ALL-LABEL: @test1b( +; LZCNT: icmp eq i64 %A, 0 +; LZCNT: call i64 @llvm.cttz.i64(i64 %A, i1 true) +; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i64 @llvm.cttz.i64(i64 %A, i1 false) +; BMI-NEXT: ret i64 [[CTTZ]] +; GENERIC: icmp eq i64 %A, 0 +; GENERIC: call i64 @llvm.cttz.i64(i64 %A, i1 true) +entry: + %tobool = icmp eq i64 %A, 
0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %0, %cond.true ], [ 64, %entry ] + ret i64 %cond +} + + +define i32 @test2b(i32 %A) { +; ALL-LABEL: @test2b( +; LZCNT: icmp eq i32 %A, 0 +; LZCNT: call i32 @llvm.cttz.i32(i32 %A, i1 true) +; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i32 @llvm.cttz.i32(i32 %A, i1 false) +; BMI-NEXT: ret i32 [[CTTZ]] +; GENERIC: icmp eq i32 %A, 0 +; GENERIC: call i32 @llvm.cttz.i32(i32 %A, i1 true) +entry: + %tobool = icmp eq i32 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %0, %cond.true ], [ 32, %entry ] + ret i32 %cond +} + + +define signext i16 @test3b(i16 signext %A) { +; ALL-LABEL: @test3b( +; LZCNT: icmp eq i16 %A, 0 +; LZCNT: call i16 @llvm.cttz.i16(i16 %A, i1 true) +; BMI: [[CTTZ:%[A-Za-z0-9]+]] = call i16 @llvm.cttz.i16(i16 %A, i1 false) +; BMI-NEXT: ret i16 [[CTTZ]] +; GENERIC: icmp eq i16 %A, 0 +; GENERIC: call i16 @llvm.cttz.i16(i16 %A, i1 true) +entry: + %tobool = icmp eq i16 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i16 [ %0, %cond.true ], [ 16, %entry ] + ret i16 %cond +} + + +define i64 @test1c(i64 %A) { +; ALL-LABEL: @test1c( +; ALL: icmp eq i64 %A, 0 +; ALL: call i64 @llvm.ctlz.i64(i64 %A, i1 true) +entry: + %tobool = icmp eq i64 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i64 @llvm.ctlz.i64(i64 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %0, %cond.true ], [ 63, %entry ] + ret i64 %cond +} + +define i32 @test2c(i32 %A) { +; ALL-LABEL: @test2c( +; ALL: icmp eq i32 %A, 0 +; ALL: call i32 @llvm.ctlz.i32(i32 %A, i1 true) +entry: + %tobool = icmp eq i32 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i32 @llvm.ctlz.i32(i32 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %0, %cond.true ], [ 31, %entry ] + ret i32 %cond +} + + +define signext i16 @test3c(i16 signext %A) { +; ALL-LABEL: @test3c( +; ALL: icmp eq i16 %A, 0 +; ALL: call i16 @llvm.ctlz.i16(i16 %A, i1 true) +entry: + %tobool = icmp eq i16 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i16 @llvm.ctlz.i16(i16 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i16 [ %0, %cond.true ], [ 15, %entry ] + ret i16 %cond +} + + +define i64 @test1d(i64 %A) { +; ALL-LABEL: @test1d( +; ALL: icmp eq i64 %A, 0 +; ALL: call i64 @llvm.cttz.i64(i64 %A, i1 true) +entry: + %tobool = icmp eq i64 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i64 @llvm.cttz.i64(i64 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %0, %cond.true ], [ 63, %entry ] + ret i64 %cond +} + + +define i32 @test2d(i32 %A) { +; ALL-LABEL: @test2d( +; ALL: icmp eq i32 %A, 0 +; ALL: call i32 @llvm.cttz.i32(i32 %A, i1 true) +entry: + %tobool = icmp eq i32 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + 
+cond.true: ; preds = %entry + %0 = tail call i32 @llvm.cttz.i32(i32 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %0, %cond.true ], [ 31, %entry ] + ret i32 %cond +} + + +define signext i16 @test3d(i16 signext %A) { +; ALL-LABEL: @test3d( +; ALL: icmp eq i16 %A, 0 +; ALL: call i16 @llvm.cttz.i16(i16 %A, i1 true) +entry: + %tobool = icmp eq i16 %A, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i16 @llvm.cttz.i16(i16 %A, i1 true) + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i16 [ %0, %cond.true ], [ 15, %entry ] + ret i16 %cond +} + +; The following tests verify that calls to cttz/ctlz are speculated even if +; basic block %cond.true has an extra zero extend/truncate which is "free" +; for the target. + +define i64 @test1e(i32 %x) { +; ALL-LABEL: @test1e( +; LZCNT: icmp eq i32 %x, 0 +; LZCNT: call i32 @llvm.cttz.i32(i32 %x, i1 true) +; BMI: call i32 @llvm.cttz.i32(i32 %x, i1 false) +; GENERIC: icmp eq i32 %x, 0 +; GENERIC: call i32 @llvm.cttz.i32(i32 %x, i1 true) +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true) + %phitmp2 = zext i32 %0 to i64 + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %phitmp2, %cond.true ], [ 32, %entry ] + ret i64 %cond +} + +define i32 @test2e(i64 %x) { +; ALL-LABEL: @test2e( +; LZCNT: icmp eq i64 %x, 0 +; LZCNT: call i64 @llvm.cttz.i64(i64 %x, i1 true) +; BMI: call i64 @llvm.cttz.i64(i64 %x, i1 false) +; GENERIC: icmp eq i64 %x, 0 +; GENERIC: call i64 @llvm.cttz.i64(i64 %x, i1 true) +entry: + %tobool = icmp eq i64 %x, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true) + %cast = trunc i64 %0 to i32 + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %cast, %cond.true ], [ 64, %entry ] + ret i32 %cond +} + +define i64 @test3e(i32 %x) { +; ALL-LABEL: @test3e( +; BMI: icmp eq i32 %x, 0 +; BMI: call i32 @llvm.ctlz.i32(i32 %x, i1 true) +; LZCNT: call i32 @llvm.ctlz.i32(i32 %x, i1 false) +; GENERIC: icmp eq i32 %x, 0 +; GENERIC: call i32 @llvm.ctlz.i32(i32 %x, i1 true) +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true) + %phitmp2 = zext i32 %0 to i64 + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %phitmp2, %cond.true ], [ 32, %entry ] + ret i64 %cond +} + +define i32 @test4e(i64 %x) { +; ALL-LABEL: @test4e( +; BMI: icmp eq i64 %x, 0 +; BMI: call i64 @llvm.ctlz.i64(i64 %x, i1 true) +; LZCNT: call i64 @llvm.ctlz.i64(i64 %x, i1 false) +; GENERIC: icmp eq i64 %x, 0 +; GENERIC: call i64 @llvm.ctlz.i64(i64 %x, i1 true) +entry: + %tobool = icmp eq i64 %x, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true) + %cast = trunc i64 %0 to i32 + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i32 [ %cast, %cond.true ], [ 64, %entry ] + ret i32 %cond +} + +define i16 @test5e(i64 %x) { +; ALL-LABEL: @test5e( +; BMI: icmp eq i64 %x, 0 +; BMI: call i64 @llvm.ctlz.i64(i64 %x, i1 true) +; LZCNT: call i64 @llvm.ctlz.i64(i64 %x, i1 false) +; GENERIC: icmp eq i64 %x, 0 +; GENERIC: call i64 @llvm.ctlz.i64(i64 %x, i1 true) 
+entry: + %tobool = icmp eq i64 %x, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true) + %cast = trunc i64 %0 to i16 + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i16 [ %cast, %cond.true ], [ 64, %entry ] + ret i16 %cond +} + +define i16 @test6e(i32 %x) { +; ALL-LABEL: @test6e( +; BMI: icmp eq i32 %x, 0 +; BMI: call i32 @llvm.ctlz.i32(i32 %x, i1 true) +; LZCNT: call i32 @llvm.ctlz.i32(i32 %x, i1 false) +; GENERIC: icmp eq i32 %x, 0 +; GENERIC: call i32 @llvm.ctlz.i32(i32 %x, i1 true) +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true) + %cast = trunc i32 %0 to i16 + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i16 [ %cast, %cond.true ], [ 32, %entry ] + ret i16 %cond +} + +define i16 @test7e(i64 %x) { +; ALL-LABEL: @test7e( +; LZCNT: icmp eq i64 %x, 0 +; LZCNT: call i64 @llvm.cttz.i64(i64 %x, i1 true) +; BMI: call i64 @llvm.cttz.i64(i64 %x, i1 false) +; GENERIC: icmp eq i64 %x, 0 +; GENERIC: call i64 @llvm.cttz.i64(i64 %x, i1 true) +entry: + %tobool = icmp eq i64 %x, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true) + %cast = trunc i64 %0 to i16 + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i16 [ %cast, %cond.true ], [ 64, %entry ] + ret i16 %cond +} + +define i16 @test8e(i32 %x) { +; ALL-LABEL: @test8e( +; LZCNT: icmp eq i32 %x, 0 +; LZCNT: call i32 @llvm.cttz.i32(i32 %x, i1 true) +; BMI: call i32 @llvm.cttz.i32(i32 %x, i1 false) +; GENERIC: icmp eq i32 %x, 0 +; GENERIC: call i32 @llvm.cttz.i32(i32 %x, i1 true) +entry: + %tobool = icmp eq i32 %x, 0 + br i1 %tobool, label %cond.end, label %cond.true + +cond.true: ; preds = %entry + %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true) + %cast = trunc i32 %0 to i16 + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i16 [ %cast, %cond.true ], [ 32, %entry ] + ret i16 %cond +} + + +declare i64 @llvm.ctlz.i64(i64, i1) +declare i32 @llvm.ctlz.i32(i32, i1) +declare i16 @llvm.ctlz.i16(i16, i1) +declare i64 @llvm.cttz.i64(i64, i1) +declare i32 @llvm.cttz.i32(i32, i1) +declare i16 @llvm.cttz.i16(i16, i1) diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll index 23f83352eb2e..16d8f97c3a21 100644 --- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll +++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll @@ -52,58 +52,153 @@ define void @_Z3barii(i32 %param1, i32 %param2) #0 { entry: %var1 = alloca %struct.AAA3, align 1 %var2 = alloca %struct.AAA3, align 1 - %tobool = icmp eq i32 %param2, 0 - br i1 %tobool, label %if.end, label %if.then + tail call void @llvm.dbg.value(metadata i32 %param1, i64 0, metadata !30, metadata !{!"0x102"}), !dbg !47 + tail call void @llvm.dbg.value(metadata i32 %param2, i64 0, metadata !31, metadata !{!"0x102"}), !dbg !47 + tail call void @llvm.dbg.value(metadata i8* null, i64 0, metadata !32, metadata !{!"0x102"}), !dbg !49 + %tobool = icmp eq i32 %param2, 0, !dbg !50 + br i1 %tobool, label %if.end, label %if.then, !dbg !50 if.then: ; preds = %entry - %call = call i8* @_Z5i2stri(i32 %param2) - br label %if.end + %call = tail call i8* @_Z5i2stri(i32 %param2), !dbg !52 + tail call void @llvm.dbg.value(metadata i8* %call, i64 0, metadata 
!32, metadata !{!"0x102"}), !dbg !49 + br label %if.end, !dbg !54 if.end: ; preds = %entry, %if.then - call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !60) - call void @llvm.dbg.value(metadata !62, i64 0, metadata !63) - %arraydecay.i = getelementptr inbounds %struct.AAA3* %var1, i64 0, i32 0, i64 0 - call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)) - call void @llvm.dbg.declare(metadata !{%struct.AAA3* %var2}, metadata !38) - %arraydecay.i5 = getelementptr inbounds %struct.AAA3* %var2, i64 0, i32 0, i64 0 - call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)) - %tobool1 = icmp eq i32 %param1, 0 - br i1 %tobool1, label %if.else, label %if.then2 + tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !33, metadata !{!"0x102"}), !dbg !55 + tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !56, metadata !{!"0x102"}), !dbg !57 + tail call void @llvm.dbg.value(metadata !58, i64 0, metadata !59, metadata !{!"0x102"}), !dbg !60 + %arraydecay.i = getelementptr inbounds %struct.AAA3* %var1, i64 0, i32 0, i64 0, !dbg !61 + call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !61 + call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !34, metadata !{!"0x102"}), !dbg !63 + call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !64, metadata !{!"0x102"}), !dbg !65 + call void @llvm.dbg.value(metadata !58, i64 0, metadata !66, metadata !{!"0x102"}), !dbg !67 + %arraydecay.i5 = getelementptr inbounds %struct.AAA3* %var2, i64 0, i32 0, i64 0, !dbg !68 + call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !68 + %tobool1 = icmp eq i32 %param1, 0, !dbg !69 + call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !34, metadata !{!"0x102"}), !dbg !63 + br i1 %tobool1, label %if.else, label %if.then2, !dbg !69 if.then2: ; preds = %if.end - call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)) - br label %if.end3 + call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !71, metadata !{!"0x102"}), !dbg !73 + call void @llvm.dbg.value(metadata !74, i64 0, metadata !75, metadata !{!"0x102"}), !dbg !76 + call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)), !dbg !76 + br label %if.end3, !dbg !72 if.else: ; preds = %if.end - call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)) + call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !77, metadata !{!"0x102"}), !dbg !79 + call void @llvm.dbg.value(metadata !80, i64 0, metadata !81, metadata !{!"0x102"}), !dbg !82 + call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)), !dbg !82 br label %if.end3 if.end3: ; preds = %if.else, %if.then2 - call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)) - ret void + call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !33, metadata !{!"0x102"}), !dbg !55 + call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !83, metadata !{!"0x102"}), !dbg !85 + call void @llvm.dbg.value(metadata !58, i64 0, metadata !86, 
metadata !{!"0x102"}), !dbg !87 + call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)), !dbg !87 + ret void, !dbg !88 } -; Function Attrs: nounwind readnone -declare void @llvm.dbg.declare(metadata, metadata) #1 - -declare i8* @_Z5i2stri(i32) #2 +declare i8* @_Z5i2stri(i32) #1 -declare void @_Z3fooPcjPKc(i8*, i32, i8*) #2 +declare void @_Z3fooPcjPKc(i8*, i32, i8*) #1 ; Function Attrs: nounwind readnone -declare void @llvm.dbg.value(metadata, i64, metadata) #1 +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2 attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!llvm.module.flags = !{!48, !49} -!llvm.ident = !{!50} - -!38 = metadata !{i32 786688, null, metadata !"var2", null, i32 20, null, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var2] [line 20] -!48 = metadata !{i32 2, metadata !"Dwarf Version", i32 4} -!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} -!50 = metadata !{metadata !"clang version 3.5 (202418)"} -!60 = metadata !{i32 786689, null, metadata !"this", null, i32 16777216, null, i32 1088, null} ; [ DW_TAG_arg_variable ] [this] [line 0] -!62 = metadata !{i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)} -!63 = metadata !{i32 786689, null, metadata !"value", null, i32 33554439, null, i32 0, null} ; [ DW_TAG_arg_variable ] [value] [line 7] +attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!44, !45} +!llvm.ident = !{!46} + +!0 = !{!"0x11\004\00clang version 3.5.0 \001\00\000\00\001", !1, !2, !3, !23, !2, !2} ; [ DW_TAG_compile_unit ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] [DW_LANG_C_plus_plus] +!1 = !{!"dbg-changes-codegen-branch-folding.cpp", !"/tmp/dbginfo"} +!2 = !{} +!3 = !{!4} +!4 = !{!"0x13\00AAA3\004\0032\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS4AAA3"} ; [ DW_TAG_structure_type ] [AAA3] [line 4, size 32, align 8, offset 0] [def] [from ] +!5 = !{!6, !11, !17, !18} +!6 = !{!"0xd\00text\008\0032\008\000\000", !1, !"_ZTS4AAA3", !7} ; [ DW_TAG_member ] [text] [line 8, size 32, align 8, offset 0] [from ] +!7 = !{!"0x1\00\000\0032\008\000\000", null, null, !8, !9, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 32, align 8, offset 0] [from char] +!8 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char] +!9 = !{!10} +!10 = !{!"0x21\000\004"} ; [ DW_TAG_subrange_type ] [0, 3] +!11 = !{!"0x2e\00AAA3\00AAA3\00\005\000\000\000\006\00256\001\005", !1, !"_ZTS4AAA3", !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 5] [AAA3] +!12 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !13, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!13 = !{null, !14, !15} +!14 = !{!"0xf\00\000\0064\0064\000\001088", null, null, 
!"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS4AAA3] +!15 = !{!"0xf\00\000\0064\0064\000\000", null, null, !16} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ] +!16 = !{!"0x26\00\000\000\000\000\000", null, null, !8} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from char] +!17 = !{!"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\000\000\006\00256\001\006", !1, !"_ZTS4AAA3", !12, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 6] [operator=] +!18 = !{!"0x2e\00operator const char *\00operator const char *\00_ZNK4AAA3cvPKcEv\007\000\000\000\006\00256\001\007", !1, !"_ZTS4AAA3", !19, null, null, null, i32 0, null} ; [ DW_TAG_subprogram ] [line 7] [operator const char *] +!19 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !20, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!20 = !{!15, !21} +!21 = !{!"0xf\00\000\0064\0064\000\001088", null, null, !22} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from ] +!22 = !{!"0x26\00\000\000\000\000\000", null, null, !"_ZTS4AAA3"} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from _ZTS4AAA3] +!23 = !{!24, !35, !40} +!24 = !{!"0x2e\00bar\00bar\00_Z3barii\0011\000\001\000\006\00256\001\0011", !1, !25, !26, null, void (i32, i32)* @_Z3barii, null, null, !29} ; [ DW_TAG_subprogram ] [line 11] [def] [bar] +!25 = !{!"0x29", !1} ; [ DW_TAG_file_type ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] +!26 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !27, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!27 = !{null, !28, !28} +!28 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!29 = !{!30, !31, !32, !33, !34} +!30 = !{!"0x101\00param1\0016777227\000", !24, !25, !28} ; [ DW_TAG_arg_variable ] [param1] [line 11] +!31 = !{!"0x101\00param2\0033554443\000", !24, !25, !28} ; [ DW_TAG_arg_variable ] [param2] [line 11] +!32 = !{!"0x100\00temp\0012\000", !24, !25, !15} ; [ DW_TAG_auto_variable ] [temp] [line 12] +!33 = !{!"0x100\00var1\0017\000", !24, !25, !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var1] [line 17] +!34 = !{!"0x100\00var2\0018\000", !24, !25, !"_ZTS4AAA3"} ; [ DW_TAG_auto_variable ] [var2] [line 18] +!35 = !{!"0x2e\00operator=\00operator=\00_ZN4AAA3aSEPKc\006\000\001\000\006\00256\001\006", !1, !"_ZTS4AAA3", !12, null, null, null, !17, !36} ; [ DW_TAG_subprogram ] [line 6] [def] [operator=] +!36 = !{!37, !39} +!37 = !{!"0x101\00this\0016777216\001088", !35, null, !38} ; [ DW_TAG_arg_variable ] [this] [line 0] +!38 = !{!"0xf\00\000\0064\0064\000\000", null, null, !"_ZTS4AAA3"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _ZTS4AAA3] +!39 = !{!"0x101\00value\0033554438\000", !35, !25, !15} ; [ DW_TAG_arg_variable ] [value] [line 6] +!40 = !{!"0x2e\00AAA3\00AAA3\00_ZN4AAA3C2EPKc\005\000\001\000\006\00256\001\005", !1, !"_ZTS4AAA3", !12, null, null, null, !11, !41} ; [ DW_TAG_subprogram ] [line 5] [def] [AAA3] +!41 = !{!42, !43} +!42 = !{!"0x101\00this\0016777216\001088", !40, null, !38} ; [ DW_TAG_arg_variable ] [this] [line 0] +!43 = !{!"0x101\00value\0033554437\000", !40, !25, !15} ; [ DW_TAG_arg_variable ] [value] [line 5] +!44 = !{i32 2, !"Dwarf Version", i32 4} +!45 = !{i32 2, !"Debug Info Version", i32 2} +!46 = !{!"clang version 3.5.0 "} +!47 = 
!MDLocation(line: 11, scope: !24) +!48 = !{i8* null} +!49 = !MDLocation(line: 12, scope: !24) +!50 = !MDLocation(line: 14, scope: !51) +!51 = !{!"0xb\0014\000\000", !1, !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] +!52 = !MDLocation(line: 15, scope: !53) +!53 = !{!"0xb\0014\000\000", !1, !51} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] +!54 = !MDLocation(line: 16, scope: !53) +!55 = !MDLocation(line: 17, scope: !24) +!56 = !{!"0x101\00this\0016777216\001088", !40, null, !38, !55} ; [ DW_TAG_arg_variable ] [this] [line 0] +!57 = !MDLocation(line: 0, scope: !40, inlinedAt: !55) +!58 = !{i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)} +!59 = !{!"0x101\00value\0033554437\000", !40, !25, !15, !55} ; [ DW_TAG_arg_variable ] [value] [line 5] +!60 = !MDLocation(line: 5, scope: !40, inlinedAt: !55) +!61 = !MDLocation(line: 5, scope: !62, inlinedAt: !55) +!62 = !{!"0xb\005\000\000", !1, !40} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] +!63 = !MDLocation(line: 18, scope: !24) +!64 = !{!"0x101\00this\0016777216\001088", !40, null, !38, !63} ; [ DW_TAG_arg_variable ] [this] [line 0] +!65 = !MDLocation(line: 0, scope: !40, inlinedAt: !63) +!66 = !{!"0x101\00value\0033554437\000", !40, !25, !15, !63} ; [ DW_TAG_arg_variable ] [value] [line 5] +!67 = !MDLocation(line: 5, scope: !40, inlinedAt: !63) +!68 = !MDLocation(line: 5, scope: !62, inlinedAt: !63) +!69 = !MDLocation(line: 20, scope: !70) +!70 = !{!"0xb\0020\000\000", !1, !24} ; [ DW_TAG_lexical_block ] [/tmp/dbginfo/dbg-changes-codegen-branch-folding.cpp] +!71 = !{!"0x101\00this\0016777216\001088", !35, null, !38, !72} ; [ DW_TAG_arg_variable ] [this] [line 0] +!72 = !MDLocation(line: 21, scope: !70) +!73 = !MDLocation(line: 0, scope: !35, inlinedAt: !72) +!74 = !{i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0)} +!75 = !{!"0x101\00value\0033554438\000", !35, !25, !15, !72} ; [ DW_TAG_arg_variable ] [value] [line 6] +!76 = !MDLocation(line: 6, scope: !35, inlinedAt: !72) +!77 = !{!"0x101\00this\0016777216\001088", !35, null, !38, !78} ; [ DW_TAG_arg_variable ] [this] [line 0] +!78 = !MDLocation(line: 23, scope: !70) +!79 = !MDLocation(line: 0, scope: !35, inlinedAt: !78) +!80 = !{i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0)} +!81 = !{!"0x101\00value\0033554438\000", !35, !25, !15, !78} ; [ DW_TAG_arg_variable ] [value] [line 6] +!82 = !MDLocation(line: 6, scope: !35, inlinedAt: !78) +!83 = !{!"0x101\00this\0016777216\001088", !35, null, !38, !84} ; [ DW_TAG_arg_variable ] [this] [line 0] +!84 = !MDLocation(line: 24, scope: !24) +!85 = !MDLocation(line: 0, scope: !35, inlinedAt: !84) +!86 = !{!"0x101\00value\0033554438\000", !35, !25, !15, !84} ; [ DW_TAG_arg_variable ] [value] [line 6] +!87 = !MDLocation(line: 6, scope: !35, inlinedAt: !84) +!88 = !MDLocation(line: 25, scope: !24) diff --git a/test/CodeGen/X86/dbg-changes-codegen.ll b/test/CodeGen/X86/dbg-changes-codegen.ll index 0b17c455408b..2179667245f1 100644 --- a/test/CodeGen/X86/dbg-changes-codegen.ll +++ b/test/CodeGen/X86/dbg-changes-codegen.ll @@ -44,7 +44,7 @@ define zeroext i1 @_ZN3Foo3batEv(%struct.Foo* %this) #0 align 2 { entry: %0 = load %struct.Foo** @pfoo, align 8 - tail call void @llvm.dbg.value(metadata !{%struct.Foo* %0}, i64 0, metadata !62) + tail call void @llvm.dbg.value(metadata %struct.Foo* %0, i64 0, metadata !62, metadata !{!"0x102"}) %cmp.i = icmp eq %struct.Foo* %0, %this ret i1 %cmp.i } @@ -53,7 +53,7 @@ entry: define 
void @_Z3bazv() #1 { entry: %0 = load %struct.Wibble** @wibble1, align 8 - tail call void @llvm.dbg.value(metadata !64, i64 0, metadata !65) + tail call void @llvm.dbg.value(metadata %struct.Flibble* undef, i64 0, metadata !65, metadata !{!"0x102"}) %1 = load %struct.Wibble** @wibble2, align 8 %cmp.i = icmp ugt %struct.Wibble* %1, %0 br i1 %cmp.i, label %if.then.i, label %_ZN7Flibble3barEP6Wibble.exit @@ -69,15 +69,15 @@ _ZN7Flibble3barEP6Wibble.exit: ; preds = %entry, %if.then.i } ; Function Attrs: nounwind readnone -declare void @llvm.dbg.value(metadata, i64, metadata) #2 +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2 attributes #0 = { nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #2 = { nounwind readnone } -!17 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo] -!45 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble] -!62 = metadata !{i32 786689, null, metadata !"arg", null, i32 33554436, metadata !17, i32 0, null} ; [ DW_TAG_arg_variable ] [arg] [line 4] -!64 = metadata !{%struct.Flibble* undef} -!65 = metadata !{i32 786689, null, metadata !"this", null, i32 16777229, metadata !45, i32 1088, null} ; [ DW_TAG_arg_variable ] [this] [line 13] +!17 = !{!"0x10\00\000\000\000\000\000", null, null, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo] +!45 = !{!"0xf\00\000\0064\0064\000\000", null, null, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble] +!62 = !{!"0x101\00arg\0033554436\000", null, null, !17} ; [ DW_TAG_arg_variable ] [arg] [line 4] +!64 = !{%struct.Flibble* undef} +!65 = !{!"0x101\00this\0016777229\001088", null, null, !45} ; [ DW_TAG_arg_variable ] [this] [line 13] diff --git a/test/CodeGen/X86/divide-by-constant.ll b/test/CodeGen/X86/divide-by-constant.ll index 21225e340826..fd07a3f55100 100644 --- a/test/CodeGen/X86/divide-by-constant.ll +++ b/test/CodeGen/X86/divide-by-constant.ll @@ -31,6 +31,7 @@ entry: ; CHECK-LABEL: test3: ; CHECK: movzbl 8(%esp), %eax ; CHECK-NEXT: imull $171, %eax +; CHECK-NEXT: andl $65024, %eax ; CHECK-NEXT: shrl $9, %eax ; CHECK-NEXT: ret } @@ -56,9 +57,10 @@ entry: %div = sdiv i16 %x, 10 ret i16 %div ; CHECK-LABEL: test6: -; CHECK: imull $26215, %eax, %ecx -; CHECK: sarl $18, %ecx -; CHECK: shrl $15, %eax +; CHECK: imull $26215, %eax +; CHECK: movl %eax, %ecx +; CHECK: shrl $31, %ecx +; CHECK: sarl $18, %eax } define i32 @test7(i32 %x) nounwind { diff --git a/test/CodeGen/X86/divrem8_ext.ll b/test/CodeGen/X86/divrem8_ext.ll new file mode 100644 index 000000000000..ec367c86526d --- /dev/null +++ b/test/CodeGen/X86/divrem8_ext.ll @@ -0,0 +1,100 @@ +; RUN: llc -march=x86-64 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-64 +; RUN: llc -march=x86 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-32 +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = 
"x86_64-apple-macosx10.10.0" + +define zeroext i8 @test_udivrem_zext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_udivrem_zext_ah +; CHECK: divb +; CHECK: movzbl %ah, [[REG_REM:%[a-z0-9]+]] +; CHECK: movb %al, ([[REG_ZPTR:%[a-z0-9]+]]) +; CHECK: movl [[REG_REM]], %eax +; CHECK: ret + %div = udiv i8 %x, %y + store i8 %div, i8* @z + %1 = urem i8 %x, %y + ret i8 %1 +} + +define zeroext i8 @test_urem_zext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_urem_zext_ah +; CHECK: divb +; CHECK: movzbl %ah, %eax +; CHECK: ret + %1 = urem i8 %x, %y + ret i8 %1 +} + +define i8 @test_urem_noext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_urem_noext_ah +; CHECK: divb [[REG_X:%[a-z0-9]+]] +; CHECK: movzbl %ah, %eax +; CHECK: addb [[REG_X]], %al +; CHECK: ret + %1 = urem i8 %x, %y + %2 = add i8 %1, %y + ret i8 %2 +} + +define i64 @test_urem_zext64_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_urem_zext64_ah +; CHECK: divb +; CHECK: movzbl %ah, %eax +; CHECK-32: xorl %edx, %edx +; CHECK: ret + %1 = urem i8 %x, %y + %2 = zext i8 %1 to i64 + ret i64 %2 +} + +define signext i8 @test_sdivrem_sext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_sdivrem_sext_ah +; CHECK: cbtw +; CHECK: idivb +; CHECK: movsbl %ah, [[REG_REM:%[a-z0-9]+]] +; CHECK: movb %al, ([[REG_ZPTR]]) +; CHECK: movl [[REG_REM]], %eax +; CHECK: ret + %div = sdiv i8 %x, %y + store i8 %div, i8* @z + %1 = srem i8 %x, %y + ret i8 %1 +} + +define signext i8 @test_srem_sext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_srem_sext_ah +; CHECK: cbtw +; CHECK: idivb +; CHECK: movsbl %ah, %eax +; CHECK: ret + %1 = srem i8 %x, %y + ret i8 %1 +} + +define i8 @test_srem_noext_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_srem_noext_ah +; CHECK: cbtw +; CHECK: idivb [[REG_X:%[a-z0-9]+]] +; CHECK: movsbl %ah, %eax +; CHECK: addb [[REG_X]], %al +; CHECK: ret + %1 = srem i8 %x, %y + %2 = add i8 %1, %y + ret i8 %2 +} + +define i64 @test_srem_sext64_ah(i8 %x, i8 %y) { +; CHECK-LABEL: test_srem_sext64_ah +; CHECK: cbtw +; CHECK: idivb +; CHECK: movsbl %ah, %eax +; CHECK-32: movl %eax, %edx +; CHECK-32: sarl $31, %edx +; CHECK-64: movsbq %al, %rax +; CHECK: ret + %1 = srem i8 %x, %y + %2 = sext i8 %1 to i64 + ret i64 %2 +} + +@z = external global i8 diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll index 0d5afa1b1384..c673f5d485f9 100644 --- a/test/CodeGen/X86/dllexport-x86_64.ll +++ b/test/CodeGen/X86/dllexport-x86_64.ll @@ -70,7 +70,7 @@ define weak_odr dllexport void @weak1() { ; CHECK: .weak weak_alias ; CHECK: weak_alias = f1 -@weak_alias = dllexport alias weak_odr void()* @f1 +@weak_alias = weak_odr dllexport alias void()* @f1 @blob = global [6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16 @blob_alias = dllexport alias bitcast ([6 x i8]* @blob to i32 ()*) diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll index e2c3f131ee06..5035aa153301 100644 --- a/test/CodeGen/X86/dllexport.ll +++ b/test/CodeGen/X86/dllexport.ll @@ -89,7 +89,7 @@ define weak_odr dllexport void @weak1() { ; CHECK: .weak _weak_alias ; CHECK: _weak_alias = _f1 -@weak_alias = dllexport alias weak_odr void()* @f1 +@weak_alias = weak_odr dllexport alias void()* @f1 ; CHECK: .section .drectve diff --git a/test/CodeGen/X86/dllimport-x86_64.ll b/test/CodeGen/X86/dllimport-x86_64.ll index 666409fd4c07..839bca4f3c31 100644 --- a/test/CodeGen/X86/dllimport-x86_64.ll +++ b/test/CodeGen/X86/dllimport-x86_64.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple x86_64-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST ; PR6275 ; -; RUN: opt -mtriple x86_64-pc-win32 -std-compile-opts -S < %s 
| FileCheck %s -check-prefix=OPT +; RUN: opt -mtriple x86_64-pc-win32 -O3 -S < %s | FileCheck %s -check-prefix=OPT @Var1 = external dllimport global i32 @Var2 = available_externally dllimport unnamed_addr constant i32 1 diff --git a/test/CodeGen/X86/dllimport.ll b/test/CodeGen/X86/dllimport.ll index 695bfce821bb..231ad65740b8 100644 --- a/test/CodeGen/X86/dllimport.ll +++ b/test/CodeGen/X86/dllimport.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple i386-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST ; PR6275 ; -; RUN: opt -mtriple i386-pc-win32 -std-compile-opts -S < %s | FileCheck %s -check-prefix=OPT +; RUN: opt -mtriple i386-pc-win32 -O3 -S < %s | FileCheck %s -check-prefix=OPT @Var1 = external dllimport global i32 @Var2 = available_externally dllimport unnamed_addr constant i32 1 diff --git a/test/CodeGen/X86/dont-trunc-store-double-to-float.ll b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll new file mode 100644 index 000000000000..24d9533eba4a --- /dev/null +++ b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll @@ -0,0 +1,20 @@ +; RUN: llc -march=x86 < %s | FileCheck %s + +; CHECK-LABEL: @bar +; CHECK: movl $1074339512, +; CHECK: movl $1374389535, +; CHECK: movl $1078523331, +define void @bar() unnamed_addr { +entry-block: + %a = alloca double + %b = alloca float + + store double 3.140000e+00, double* %a + %0 = load double* %a + + %1 = fptrunc double %0 to float + + store float %1, float* %b + + ret void +} diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll index c8d752771044..77eba63a83ec 100644 --- a/test/CodeGen/X86/dwarf-comp-dir.ll +++ b/test/CodeGen/X86/dwarf-comp-dir.ll @@ -7,15 +7,15 @@ target triple = "x86_64-unknown-linux-gnu" !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!5} -!0 = metadata !{i32 720913, metadata !4, i32 12, metadata !"clang version 3.1 (trunk 143523)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] -!2 = metadata !{} -!3 = metadata !{i32 786473, metadata !4} ; [ DW_TAG_file_type ] -!4 = metadata !{metadata !"empty.c", metadata !"/home/nlewycky"} -!6 = metadata !{i32 786451, metadata !4, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ] -!7 = metadata !{metadata !6} +!0 = !{!"0x11\0012\00clang version 3.1 (trunk 143523)\001\00\000\00\000", !4, !2, !7, !2, !2, null} ; [ DW_TAG_compile_unit ] +!2 = !{} +!3 = !{!"0x29", !4} ; [ DW_TAG_file_type ] +!4 = !{!"empty.c", !"/home/nlewycky"} +!6 = !{!"0x13\00foo\001\008\008\000\000\000", !4, null, null, !2, null, null, !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ] +!7 = !{!6} ; The important part of the following check is that dir = #0. ; Dir Mod Time File Len File Name ; ---- ---------- ---------- --------------------------- ; CHECK: file_names[ 1] 0 0x00000000 0x00000000 empty.c -!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!5 = !{i32 1, !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/dynamic-alloca-lifetime.ll b/test/CodeGen/X86/dynamic-alloca-lifetime.ll new file mode 100644 index 000000000000..f019bed858c2 --- /dev/null +++ b/test/CodeGen/X86/dynamic-alloca-lifetime.ll @@ -0,0 +1,44 @@ +; RUN: llc -no-stack-coloring=false < %s | FileCheck %s + +; This test crashed in PEI because the stack protector was dead. 
+; This was due to it being colored, which was in turn due to incorrect +; lifetimes being applied to the stack protector frame index. + +; CHECK: stack_chk_guard + +target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128" +target triple = "i386-apple-macosx10.10.0" + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: ssp +define void @foo(i1 %cond1, i1 %cond2) #1 { +entry: + %bitmapBuffer = alloca [8192 x i8], align 1 + br i1 %cond1, label %end1, label %bb1 + +bb1: + %bitmapBuffer229 = alloca [8192 x i8], align 1 + br i1 %cond2, label %end1, label %if.else130 + +end1: + ret void + +if.else130: ; preds = %bb1 + %tmp = getelementptr inbounds [8192 x i8]* %bitmapBuffer, i32 0, i32 0 + call void @llvm.lifetime.start(i64 8192, i8* %tmp) #0 + call void @llvm.lifetime.end(i64 8192, i8* %tmp) #0 + %tmp25 = getelementptr inbounds [8192 x i8]* %bitmapBuffer229, i32 0, i32 0 + call void @llvm.lifetime.start(i64 8192, i8* %tmp25) #0 + call void @llvm.lifetime.end(i64 8192, i8* %tmp25) #0 + br label %end1 +} + +declare void @bar() + +attributes #0 = { nounwind } +attributes #1 = { ssp }
\ No newline at end of file
diff --git a/test/CodeGen/X86/elf-comdat.ll b/test/CodeGen/X86/elf-comdat.ll index c7e6df7d64f0..35d8d6f2d2af 100644 --- a/test/CodeGen/X86/elf-comdat.ll +++ b/test/CodeGen/X86/elf-comdat.ll @@ -1,8 +1,8 @@ ; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s $f = comdat any -@v = global i32 0, comdat $f -define void @f() comdat $f { +@v = global i32 0, comdat($f) +define void @f() comdat($f) { ret void } ; CHECK: .section .text.f,"axG",@progbits,f,comdat
diff --git a/test/CodeGen/X86/elf-comdat2.ll b/test/CodeGen/X86/elf-comdat2.ll index 209da39ed881..786cec78cc30 100644 --- a/test/CodeGen/X86/elf-comdat2.ll +++ b/test/CodeGen/X86/elf-comdat2.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s $foo = comdat any -@bar = global i32 42, comdat $foo +@bar = global i32 42, comdat($foo) @foo = global i32 42 ; CHECK: .type bar,@object
diff --git a/test/CodeGen/X86/empty-functions.ll b/test/CodeGen/X86/empty-functions.ll index ac5174db5fc5..42349688a710 100644 --- a/test/CodeGen/X86/empty-functions.ll +++ b/test/CodeGen/X86/empty-functions.ll @@ -1,10 +1,14 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck -check-prefix=CHECK-NO-FP %s ; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s +; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=LINUX-NO-FP %s +; RUN: llc < %s -mtriple=x86_64-linux-gnu -disable-fp-elim | FileCheck -check-prefix=LINUX-FP %s define void @func() { entry: unreachable } + +; MachO cannot handle an empty function. ; CHECK-NO-FP: _func: ; CHECK-NO-FP-NEXT: .cfi_startproc ; CHECK-NO-FP: nop @@ -21,5 +25,30 @@ entry: ; CHECK-FP-NEXT: movq %rsp, %rbp ; CHECK-FP-NEXT: : ; CHECK-FP-NEXT: .cfi_def_cfa_register %rbp -; CHECK-FP-NEXT: nop ; CHECK-FP-NEXT: .cfi_endproc + +; An empty function is perfectly fine on ELF. +; LINUX-NO-FP: func: +; LINUX-NO-FP-NEXT: .cfi_startproc +; LINUX-NO-FP-NEXT: {{^}}# +; LINUX-NO-FP-NEXT: {{^}}.L{{.*}}:{{$}} +; LINUX-NO-FP-NEXT: .size func, .L{{.*}}-func +; LINUX-NO-FP-NEXT: .cfi_endproc + +; A cfi directive can point to the end of a function. It (and in fact the +; entire body) could be optimized out because of the unreachable, but we +; don't do it right now.
+; LINUX-FP: func: +; LINUX-FP-NEXT: .cfi_startproc +; LINUX-FP-NEXT: {{^}}# +; LINUX-FP-NEXT: pushq %rbp +; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} +; LINUX-FP-NEXT: .cfi_def_cfa_offset 16 +; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}} +; LINUX-FP-NEXT: .cfi_offset %rbp, -16 +; LINUX-FP-NEXT: movq %rsp, %rbp +; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}} +; LINUX-FP-NEXT: .cfi_def_cfa_register %rbp +; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}} +; LINUX-FP-NEXT: .size func, .Ltmp3-func +; LINUX-FP-NEXT: .cfi_endproc
diff --git a/test/CodeGen/X86/equiv_with_fndef.ll b/test/CodeGen/X86/equiv_with_fndef.ll new file mode 100644 index 000000000000..efbb8ab3da69 --- /dev/null +++ b/test/CodeGen/X86/equiv_with_fndef.ll @@ -0,0 +1,10 @@ +; RUN: not llc < %s 2>&1 | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +module asm ".equiv pselect, __pselect" + +define void @pselect() { + ret void +} +; CHECK: 'pselect' is a protected alias
diff --git a/test/CodeGen/X86/equiv_with_vardef.ll b/test/CodeGen/X86/equiv_with_vardef.ll new file mode 100644 index 000000000000..29c19a107ec3 --- /dev/null +++ b/test/CodeGen/X86/equiv_with_vardef.ll @@ -0,0 +1,8 @@ +; RUN: not llc < %s 2>&1 | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +module asm ".equiv var, __var" + +@var = global i32 0 +; CHECK: symbol 'var' is already defined
diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll index a18f75195631..ab92fe0d1d0c 100644 --- a/test/CodeGen/X86/exedepsfix-broadcast.ll +++ b/test/CodeGen/X86/exedepsfix-broadcast.ll @@ -93,10 +93,11 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> % ; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg -; ExeDepsFix works top down, thus it coalesces vmovlhps domain with -; vandps and there is nothing more you can do to match vmaxpd. -; CHECK: vmovlhps -; CHECK: vandps +; ExeDepsFix works top down, thus it coalesces vpunpcklqdq domain with +; vpand and there is nothing more you can do to match vmaxpd. +; CHECK: vmovq +; CHECK: vpbroadcastq +; CHECK: vpand ; CHECK: vmaxpd ; CHECK: ret define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll index cadc0fb723f9..732f698f59ff 100644 --- a/test/CodeGen/X86/extractelement-load.ll +++ b/test/CodeGen/X86/extractelement-load.ll @@ -1,5 +1,8 @@ ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=yonah | FileCheck %s ; RUN: llc < %s -march=x86-64 -mattr=+sse2 -mcpu=core2 | FileCheck %s +; RUN: llc < %s -march=x86-64 -mattr=+avx -mcpu=btver2 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define i32 @t(<2 x i64>* %val) nounwind { ; CHECK-LABEL: t: @@ -23,3 +26,39 @@ undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3> %y = extractelement <8 x i32> %Shuff68, i32 0 ret i32 %y } + +; This case could easily end up inf-looping in the DAG combiner due to a +; low alignment load of the vector, which prevents us from reliably forming a +; narrow load. + +; The expected codegen is identical for the AVX case except +; that load/store instructions will have a leading 'v', so we don't +; need to special-case the checks.
+ +define void @t3() { +; CHECK-LABEL: t3: +; CHECK: movupd +; CHECK: movhpd + +bb: + %tmp13 = load <2 x double>* undef, align 1 + %.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1 + store double %.sroa.3.24.vec.extract, double* undef, align 8 + unreachable +} + +; Case where a load is unary shuffled, then bitcast (to a type with the same +; number of elements) before extractelement. +; This is testing for an assertion - the extraction was assuming that the undef +; second shuffle operand was a post-bitcast type instead of a pre-bitcast type. +define i64 @t4(<2 x double>* %a) { +; CHECK-LABEL: t4: +; CHECK: mov +; CHECK: ret + %b = load <2 x double>* %a, align 16 + %c = shufflevector <2 x double> %b, <2 x double> %b, <2 x i32> <i32 1, i32 0> + %d = bitcast <2 x double> %c to <2 x i64> + %e = extractelement <2 x i64> %d, i32 1 + ret i64 %e +} +
diff --git a/test/CodeGen/X86/fast-isel-args-fail.ll b/test/CodeGen/X86/fast-isel-args-fail.ll index 7467edd74f21..7e783d2891d4 100644 --- a/test/CodeGen/X86/fast-isel-args-fail.ll +++ b/test/CodeGen/X86/fast-isel-args-fail.ll @@ -1,7 +1,6 @@ ; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-apple-darwin10 ; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN32 ; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win64 | FileCheck %s -check-prefix=WIN64 -; REQUIRES: asserts ; Previously, this would cause an assert. define i31 @t1(i31 %a, i31 %b, i31 %c) {
diff --git a/test/CodeGen/X86/fast-isel-branch_weights.ll b/test/CodeGen/X86/fast-isel-branch_weights.ll index bc41395e1e83..d2b02aad182d 100644 --- a/test/CodeGen/X86/fast-isel-branch_weights.ll +++ b/test/CodeGen/X86/fast-isel-branch_weights.ll @@ -16,4 +16,4 @@ success: ret i64 0 } -!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647} +!0 = !{!"branch_weights", i32 0, i32 2147483647}
diff --git a/test/CodeGen/X86/fast-isel-call-bool.ll b/test/CodeGen/X86/fast-isel-call-bool.ll new file mode 100644 index 000000000000..5cdb2c941161 --- /dev/null +++ b/test/CodeGen/X86/fast-isel-call-bool.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -fast-isel -mcpu=core2 -mtriple=x86_64-unknown-unknown -O1 | FileCheck %s +; See PR21557 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @bar(i1) + +define i64 @foo(i8* %arg) { +; CHECK-LABEL: foo: +top: + %0 = load i8* %arg +; CHECK: movb + %1 = trunc i8 %0 to i1 +; CHECK: andb $1, + %2 = call i64 @bar(i1 %1) +; CHECK: callq + ret i64 %2 +}
diff --git a/test/CodeGen/X86/fast-isel-cmp-branch.ll b/test/CodeGen/X86/fast-isel-cmp-branch.ll index 6e408f896663..684647ca9484 100644 --- a/test/CodeGen/X86/fast-isel-cmp-branch.ll +++ b/test/CodeGen/X86/fast-isel-cmp-branch.ll @@ -1,5 +1,5 @@ ; RUN: llc -O0 -mtriple=x86_64-linux -asm-verbose=false < %s | FileCheck %s -; RUN: llc -O0 -mtriple=x86_64-win32 -asm-verbose=false < %s | FileCheck %s +; RUN: llc -O0 -mtriple=x86_64-windows-itanium -asm-verbose=false < %s | FileCheck %s ; rdar://8337108 ; Fast-isel shouldn't try to look through the compare because it's in a
diff --git a/test/CodeGen/X86/fast-isel-cmp-branch3.ll b/test/CodeGen/X86/fast-isel-cmp-branch3.ll index a3f6851ca240..0df782d18ecf 100644 --- a/test/CodeGen/X86/fast-isel-cmp-branch3.ll +++ b/test/CodeGen/X86/fast-isel-cmp-branch3.ll @@ -351,7 +351,7 @@ bb1: define i32 @icmp_eq(i32 %x) { ; CHECK-LABEL: icmp_eq ; CHECK-NOT: cmpl -; CHECK: movl $0, %eax +; CHECK: xorl %eax, %eax %1 = icmp eq i32 %x, %x br i1 %1, label %bb1, label %bb2
bb2: @@ -387,7 +387,7 @@ bb1: define i32 @icmp_uge(i32 %x) { ; CHECK-LABEL: icmp_uge ; CHECK-NOT: cmpl -; CHECK: movl $0, %eax +; CHECK: xorl %eax, %eax %1 = icmp uge i32 %x, %x br i1 %1, label %bb1, label %bb2 bb2: @@ -411,7 +411,7 @@ bb1: define i32 @icmp_ule(i32 %x) { ; CHECK-LABEL: icmp_ule ; CHECK-NOT: cmpl -; CHECK: movl $0, %eax +; CHECK: xorl %eax, %eax %1 = icmp ule i32 %x, %x br i1 %1, label %bb1, label %bb2 bb2: @@ -435,7 +435,7 @@ bb1: define i32 @icmp_sge(i32 %x) { ; CHECK-LABEL: icmp_sge ; CHECK-NOT: cmpl -; CHECK: movl $0, %eax +; CHECK: xorl %eax, %eax %1 = icmp sge i32 %x, %x br i1 %1, label %bb1, label %bb2 bb2: @@ -459,7 +459,7 @@ bb1: define i32 @icmp_sle(i32 %x) { ; CHECK-LABEL: icmp_sle ; CHECK-NOT: cmpl -; CHECK: movl $0, %eax +; CHECK: xorl %eax, %eax %1 = icmp sle i32 %x, %x br i1 %1, label %bb1, label %bb2 bb2:
diff --git a/test/CodeGen/X86/fast-isel-constpool.ll b/test/CodeGen/X86/fast-isel-constpool.ll index bbbaeb233919..4e6f7c0f9e8e 100644 --- a/test/CodeGen/X86/fast-isel-constpool.ll +++ b/test/CodeGen/X86/fast-isel-constpool.ll @@ -1,19 +1,23 @@ -; RUN: llc < %s -fast-isel | FileCheck %s -; CHECK: LCPI0_0(%rip) +; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=small < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=large < %s | FileCheck %s --check-prefix=LARGE -; Make sure fast isel uses rip-relative addressing when required. -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" -target triple = "x86_64-apple-darwin9.0" +; Make sure fast isel uses rip-relative addressing for the small code model. +define float @constpool_float(float %x) { +; CHECK-LABEL: constpool_float +; CHECK: LCPI0_0(%rip) -define i32 @f0(double %x) nounwind { -entry: - %retval = alloca i32 ; <i32*> [#uses=2] - %x.addr = alloca double ; <double*> [#uses=2] - store double %x, double* %x.addr - %tmp = load double* %x.addr ; <double> [#uses=1] - %cmp = fcmp olt double %tmp, 8.500000e-01 ; <i1> [#uses=1] - %conv = zext i1 %cmp to i32 ; <i32> [#uses=1] - store i32 %conv, i32* %retval - %0 = load i32* %retval ; <i32> [#uses=1] - ret i32 %0 +; LARGE-LABEL: constpool_float +; LARGE: movabsq $LCPI0_0, %rax + %1 = fadd float %x, 16.50e+01 + ret float %1 +} + +define double @constpool_double(double %x) nounwind { +; CHECK-LABEL: constpool_double +; CHECK: LCPI1_0(%rip) + +; LARGE-LABEL: constpool_double +; LARGE: movabsq $LCPI1_0, %rax + %1 = fadd double %x, 8.500000e-01 + ret double %1 }
diff --git a/test/CodeGen/X86/fast-isel-gep.ll b/test/CodeGen/X86/fast-isel-gep.ll index 4e47c7455c53..a65e0705f2b2 100644 --- a/test/CodeGen/X86/fast-isel-gep.ll +++ b/test/CodeGen/X86/fast-isel-gep.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=x86_64-linux -O0 | FileCheck %s --check-prefix=X64 -; RUN: llc < %s -mtriple=x86_64-win32 -O0 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=x86_64-windows-itanium -O0 | FileCheck %s --check-prefix=X64 ; RUN: llc < %s -march=x86 -O0 | FileCheck %s --check-prefix=X32 ; GEP indices are interpreted as signed integers, so they
diff --git a/test/CodeGen/X86/fast-isel-mem.ll b/test/CodeGen/X86/fast-isel-mem.ll index cd2dc1d02c8a..eca1ae9f02a3 100644 --- a/test/CodeGen/X86/fast-isel-mem.ll +++ b/test/CodeGen/X86/fast-isel-mem.ll @@ -36,11 +36,11 @@ entry: store i32 (...)** getelementptr ([4 x i32 (...)*]* @LotsStuff, i32 0, i32 2), i32 (...)*** null, align 4 ret void ; CHECK: _t: -; CHECK: movl $0, %eax +; CHECK: xorl %eax, %eax ; CHECK: movl L_LotsStuff$non_lazy_ptr, %ecx ; ATOM: _t: ; ATOM: movl L_LotsStuff$non_lazy_ptr, %e{{..}} -; ATOM: movl $0, %e{{..}} +; ATOM: xorl %e{{..}}, %e{{..}} }
diff --git a/test/CodeGen/X86/fast-isel-tls.ll b/test/CodeGen/X86/fast-isel-tls.ll index f71abd2fec01..686df43ac504 100644 --- a/test/CodeGen/X86/fast-isel-tls.ll +++ b/test/CodeGen/X86/fast-isel-tls.ll @@ -13,7 +13,7 @@ entry: ; CHECK: leal v@TLSGD ; CHECK: __tls_get_addr -@alias = alias internal i32* @v +@alias = internal alias i32* @v define i32 @f_alias() nounwind { entry: %t = load i32* @v
diff --git a/test/CodeGen/X86/fast-isel-x32.ll b/test/CodeGen/X86/fast-isel-x32.ll new file mode 100644 index 000000000000..d49a10801065 --- /dev/null +++ b/test/CodeGen/X86/fast-isel-x32.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel -fast-isel-abort | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-nacl -fast-isel -fast-isel-abort | FileCheck %s + +; Test that alloca addresses are materialized with the right size instruction. + +declare void @bar(i32* %arg) + +; CHECK-LABEL: @foo +define void @foo() { + %a = alloca i32 +; CHECK: leal {{.*}}, %edi + call void @bar(i32* %a) + ret void +}
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll index f7d2750b5b81..3747d049424d 100644 --- a/test/CodeGen/X86/fast-isel-x86-64.ll +++ b/test/CodeGen/X86/fast-isel-x86-64.ll @@ -144,7 +144,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-LABEL: test12: ; CHECK: testb $1, ; CHECK-NEXT: je L -; CHECK-NEXT: movl $0, %edi +; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: callq } @@ -154,7 +154,7 @@ define void @test13() nounwind { call void @test13f(i1 0) ret void ; CHECK-LABEL: test13: -; CHECK: movl $0, %edi +; CHECK: xorl %edi, %edi ; CHECK-NEXT: callq } @@ -194,12 +194,10 @@ define void @test16() nounwind { br label %block2 block2: -; CHECK: movabsq $1 -; CHECK: cvtsi2sdq {{.*}} %xmm0 +; CHECK: movsd LCP{{.*}}_{{.*}}(%rip), %xmm0 ; CHECK: movb $1, %al ; CHECK: callq _test16callee -; AVX: movabsq $1 ; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0 ; AVX: movb $1, %al ; AVX: callq _test16callee @@ -280,7 +278,7 @@ entry: call void @foo22(i32 3) ret void ; CHECK-LABEL: test22: -; CHECK: movl $0, %edi +; CHECK: xorl %edi, %edi ; CHECK: callq _foo22 ; CHECK: movl $1, %edi ; CHECK: callq _foo22 @@ -304,3 +302,13 @@ define void @test23(i8* noalias sret %result) { } declare i8* @foo23() + +declare void @takesi32ptr(i32* %arg) + +; CHECK-LABEL: allocamaterialize +define void @allocamaterialize() { + %a = alloca i32 +; CHECK: leaq {{.*}}, %rdi + call void @takesi32ptr(i32* %a) + ret void +}
diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll index a212a7c6876e..61e9b98f6e7e 100644 --- a/test/CodeGen/X86/fast-isel-x86.ll +++ b/test/CodeGen/X86/fast-isel-x86.ll @@ -60,3 +60,21 @@ entry: ; CHECK: addl $28 } declare fastcc void @test4fastccsret(%struct.a* sret) + + +; Check that fast-isel cleans up when it fails to lower a call instruction.
+define void @test5() { +entry: + %call = call i32 @test5dllimport(i32 42) + ret void +; CHECK-LABEL: test5: +; Local value area is still there: +; CHECK: movl $42, {{%[a-z]+}} +; Fast-ISel's arg push is not here: +; CHECK-NOT: movl $42, (%esp) +; SDag-ISel's arg push: +; CHECK: movl %esp, [[REGISTER:%[a-z]+]] +; CHECK: movl $42, ([[REGISTER]]) +; CHECK: movl __imp__test5dllimport +} +declare dllimport i32 @test5dllimport(i32)
diff --git a/test/CodeGen/X86/fastmath-optnone.ll b/test/CodeGen/X86/fastmath-optnone.ll new file mode 100644 index 000000000000..0caadff89167 --- /dev/null +++ b/test/CodeGen/X86/fastmath-optnone.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -mcpu=corei7 -march=x86-64 -mattr=+sse2 | FileCheck %s +; Verify that floating-point operations inside 'optnone' functions +; are not optimized even if unsafe-fp-math is set. + +define float @foo(float %x) #0 { +entry: + %add = fadd fast float %x, %x + %add1 = fadd fast float %add, %x + ret float %add1 +} + +; CHECK-LABEL: @foo +; CHECK-NOT: add +; CHECK: mul +; CHECK-NOT: add +; CHECK: ret + +define float @fooWithOptnone(float %x) #1 { +entry: + %add = fadd fast float %x, %x + %add1 = fadd fast float %add, %x + ret float %add1 +} + +; CHECK-LABEL: @fooWithOptnone +; CHECK-NOT: mul +; CHECK: add +; CHECK-NOT: mul +; CHECK: add +; CHECK-NOT: mul +; CHECK: ret + + +attributes #0 = { "unsafe-fp-math"="true" } +attributes #1 = { noinline optnone "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma-intrinsics-x86_64.ll index 494cb28677a4..aadd7311bb89 100644 --- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll +++ b/test/CodeGen/X86/fma-intrinsics-x86_64.ll @@ -1,316 +1,278 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK-FMA --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK-FMA4 --check-prefix=CHECK
; VFMADD define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfmaddss - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] - ret < 4 x float > %res -} -define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) { - ; CHECK: vfmaddss (%{{.*}}) - %x = load float *%a2 - %y = insertelement <4 x float> undef, float %x, i32 0 - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y) ; <i64> [#uses=1] - ret < 4 x float > %res -} -define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) { - ; CHECK: vfmaddss %{{.*}}, (%{{.*}}) - %x = load float *%a1 - %y = insertelement <4 x float> undef, float %x, i32 0 - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmaddss + ; CHECK-FMA: vfmadd213ss + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfmaddsd - %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] - ret < 2 x double > %res -} -define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) { - ; CHECK: vfmaddsd (%{{.*}}) - %x = load double *%a2 - %y = insertelement <2 x double> undef, double %x, i32 0 - %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y) ; <i64> [#uses=1] - ret < 2 x double > %res -} -define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) { - ; CHECK: vfmaddsd %{{.*}}, (%{{.*}}) - %x = load double *%a1 - %y = insertelement <2 x double> undef, double %x, i32 0 - %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmaddsd + ; CHECK-FMA: vfmadd213sd + %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 4 x float > @test_x86_fma_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfmaddps - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] - ret < 4 x float > %res -} -define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) { - ; CHECK: vfmaddps (%{{.*}}) - %x = load <4 x float>* %a2 - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x) ; <i64> [#uses=1] - ret < 4 x float > %res -} -define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) { - ; CHECK: vfmaddps %{{.*}}, (%{{.*}}) - %x = load <4 x float>* %a1 - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmaddps + ; CHECK-FMA: vfmadd213ps + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone -; To test execution dependency -define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) { - ; CHECK: vmovaps - ; CHECK: vfmaddps %{{.*}}, (%{{.*}}) - %x = load <4 x float>* %a0 - %y = load <4 x float>* %a1 - %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %x, < 4 x float > %y, < 4 x float > %a2) ; <i64> [#uses=1] - ret < 4 x float > %res -} - define < 2 x double > @test_x86_fma_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfmaddpd - %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] - ret < 2 x double > %res -} -define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) { - ; CHECK: vfmaddpd (%{{.*}}) - %x = load <2 x double>* %a2 - %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x) ; <i64> [#uses=1] - ret < 2 x double > %res -} -define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) { - ; CHECK: vfmaddpd %{{.*}}, (%{{.*}}) - %x = load <2 x double>* %a1 - %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmaddpd + ; CHECK-FMA: vfmadd213pd + %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone -; To test execution dependency -define < 2 x double > @test_x86_fma_vfmadd_pd_load3(< 2 x double >* %a0, < 2 x double >* %a1, < 2 x double > %a2) { - ; CHECK: vmovapd - ; CHECK: vfmaddpd %{{.*}}, (%{{.*}}) - %x = load <2 x double>* %a0 - %y = load <2 x double>* %a1 - %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %x, < 2 x double > %y, < 2 x double > %a2) ; <i64> [#uses=1] - ret < 2 x double > %res -} - define < 8 x float > @test_x86_fma_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK: vfmaddps + ; CHECK-FMA4: vfmaddps + ; CHECK-FMA: vfmadd213ps ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1] + %res = call < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ret < 8 x float > %res } declare < 8 x float > @llvm.x86.fma.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone define < 4 x double > @test_x86_fma_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK: vfmaddpd + ; CHECK-FMA4: vfmaddpd + ; CHECK-FMA: vfmadd213pd ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1] + %res = call < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ret < 4 x double > %res } declare < 4 x double > @llvm.x86.fma.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFMSUB define < 4 x float > @test_x86_fma_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfmsubss - %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmsubss + ; CHECK-FMA: vfmsub213ss + %res = call < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfmsubsd - %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmsubsd + ; CHECK-FMA: vfmsub213sd + %res = call < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 4 x float > @test_x86_fma_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfmsubps - %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmsubps + ; CHECK-FMA: vfmsub213ps + %res = call < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfmsubpd - %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmsubpd + ; CHECK-FMA: vfmsub213pd + %res = call < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 8 x float > @test_x86_fma_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK: vfmsubps + ; CHECK-FMA4: vfmsubps + ; CHECK-FMA: vfmsub213ps ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1] + %res = call < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ret < 8 x float > %res } declare < 8 x float > @llvm.x86.fma.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone define < 4 x double > @test_x86_fma_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK: vfmsubpd + ; CHECK-FMA4: vfmsubpd + ; CHECK-FMA: vfmsub213pd ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1] + %res = call < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ret < 4 x double > %res } declare < 4 x double > @llvm.x86.fma.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFNMADD define < 4 x float > @test_x86_fma_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfnmaddss - %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmaddss + ; CHECK-FMA: vfnmadd213ss + %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfnmaddsd - %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmaddsd + ; CHECK-FMA: vfnmadd213sd + %res = call < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 4 x float > @test_x86_fma_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfnmaddps - %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmaddps + ; CHECK-FMA: vfnmadd213ps + %res = call < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfnmaddpd - %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmaddpd + ; CHECK-FMA: vfnmadd213pd + %res = call < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 8 x float > @test_x86_fma_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK: vfnmaddps + ; CHECK-FMA4: vfnmaddps + ; CHECK-FMA: vfnmadd213ps ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1] + %res = call < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ret < 8 x float > %res } declare < 8 x float > @llvm.x86.fma.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone define < 4 x double > @test_x86_fma_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK: vfnmaddpd + ; CHECK-FMA4: vfnmaddpd + ; CHECK-FMA: vfnmadd213pd ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1] + %res = call < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ret < 4 x double > %res } declare < 4 x double > @llvm.x86.fma.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFNMSUB define < 4 x float > @test_x86_fma_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfnmsubss - %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmsubss + ; CHECK-FMA: vfnmsub213ss + %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfnmsubsd - %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmsubsd + ; CHECK-FMA: vfnmsub213sd + %res = call < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 4 x float > @test_x86_fma_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfnmsubps - %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmsubps + ; CHECK-FMA: vfnmsub213ps + %res = call < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfnmsubpd - %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfnmsubpd + ; CHECK-FMA: vfnmsub213pd + %res = call < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 8 x float > @test_x86_fma_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK: vfnmsubps + ; CHECK-FMA4: vfnmsubps + ; CHECK-FMA: vfnmsub213ps ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1] + %res = call < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ret < 8 x float > %res } declare < 8 x float > @llvm.x86.fma.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone define < 4 x double > @test_x86_fma_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK: vfnmsubpd + ; CHECK-FMA4: vfnmsubpd + ; CHECK-FMA: vfnmsub213pd ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1] + %res = call < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ret < 4 x double > %res } declare < 4 x double > @llvm.x86.fma.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFMADDSUB define < 4 x float > @test_x86_fma_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfmaddsubps - %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmaddsubps + ; CHECK-FMA: vfmaddsub213ps + %res = call < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfmaddsubpd - %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmaddsubpd + ; CHECK-FMA: vfmaddsub213pd + %res = call < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 8 x float > @test_x86_fma_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK: vfmaddsubps + ; CHECK-FMA4: vfmaddsubps + ; CHECK-FMA: vfmaddsub213ps ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1] + %res = call < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ret < 8 x float > %res } declare < 8 x float > @llvm.x86.fma.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone define < 4 x double > @test_x86_fma_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK: vfmaddsubpd + ; CHECK-FMA4: vfmaddsubpd + ; CHECK-FMA: vfmaddsub213pd ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1] + %res = call < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ret < 4 x double > %res } declare < 4 x double > @llvm.x86.fma.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
; VFMSUBADD define < 4 x float > @test_x86_fma_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) { - ; CHECK: vfmsubaddps - %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmsubaddps + ; CHECK-FMA: vfmsubadd213ps + %res = call < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ret < 4 x float > %res } declare < 4 x float > @llvm.x86.fma.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone define < 2 x double > @test_x86_fma_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { - ; CHECK: vfmsubaddpd - %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ; CHECK-FMA4: vfmsubaddpd + ; CHECK-FMA: vfmsubadd213pd + %res = call < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ret < 2 x double > %res } declare < 2 x double > @llvm.x86.fma.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone define < 8 x float > @test_x86_fma_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) { - ; CHECK: vfmsubaddps + ; CHECK-FMA4: vfmsubaddps + ; CHECK-FMA: vfmsubadd213ps ; CHECK: ymm - %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1] + %res = call < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ret < 8 x float > %res } declare < 8 x float > @llvm.x86.fma.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone define < 4 x double > @test_x86_fma_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) { - ; CHECK: vfmsubaddpd + ; CHECK-FMA4: vfmsubaddpd + ; CHECK-FMA: vfmsubadd213pd ; CHECK: ymm - %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1] + %res = call < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ret < 4 x double > %res } declare < 4 x double > @llvm.x86.fma.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
diff --git a/test/CodeGen/X86/fma-phi-213-to-231.ll b/test/CodeGen/X86/fma-phi-213-to-231.ll new file mode 100644 index 000000000000..9715bc7b328b --- /dev/null +++ b/test/CodeGen/X86/fma-phi-213-to-231.ll @@ -0,0 +1,246 @@ +; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" +
+; CHECK-LABEL: fmaddsubpd_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmaddsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x double> %c.addr.0 +} +
+; CHECK-LABEL: fmsubaddpd_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmsubadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x double> %c.addr.0 +} +
+; CHECK-LABEL: fmaddpd_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x double> %c.addr.0 +} +
+; CHECK-LABEL: fmsubpd_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x double> %c.addr.0 +} +
+declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>) +declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>) +declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) +declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) + +
+; CHECK-LABEL: fmaddsubps_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmaddsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <8 x float> %c.addr.0 +} +
+; CHECK-LABEL: fmsubaddps_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmsubadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <8 x float> %c.addr.0 +} +
+; CHECK-LABEL: fmaddps_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <8 x float> %c.addr.0 +} +
+; CHECK-LABEL: fmsubps_loop +; CHECK: [[BODYLBL:LBB.+]]: +; CHECK: vfmsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} +; CHECK: [[INCLBL:LBB.+]]: +; CHECK: addl $1, [[INDREG:%[a-z0-9]+]] +; CHECK: cmpl {{%.+}}, [[INDREG]] +; CHECK: jl [[BODYLBL]] +define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <8 x float> %c.addr.0 +} +
+declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>) +declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>) +declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) +declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll new file mode 100644 index 000000000000..64a2068aea43 --- /dev/null +++ b/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll @@ -0,0 +1,84 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s + +; VFMADD +define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) { + ; CHECK: vfmaddss (%{{.*}}) + %x = load float *%a2 + %y = insertelement <4 x float> undef, float %x, i32 0 + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y) + ret < 4 x float > %res +} +define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) { + ; CHECK: vfmaddss %{{.*}}, (%{{.*}}) + %x = load float *%a1 + %y = insertelement <4 x float> undef, float %x, i32 0 + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2) + ret < 4 x float > %res +} + +declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone + +define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) { + ; CHECK: vfmaddsd (%{{.*}}) + %x = load double *%a2 + %y = insertelement <2 x double> undef, double %x, i32 0 + %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y) + ret < 2 x double > %res +} +define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) { + ; CHECK: vfmaddsd %{{.*}}, (%{{.*}}) + %x = load double *%a1 + %y = insertelement <2 x double> undef, double %x, i32 0 + %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2) + ret < 2 x double > %res +} +declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone +define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) { + ; CHECK: vfmaddps (%{{.*}}) + %x = load <4 x float>* %a2 + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x) + ret < 4 x float > %res +} +define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) { + ; CHECK: vfmaddps %{{.*}}, (%{{.*}}) + %x = load <4 x float>* %a1 + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2) + ret < 4 x float > %res +} +declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone + +; To test execution dependency +define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) { + ; CHECK: vmovaps + ; CHECK: vfmaddps %{{.*}}, (%{{.*}}) + %x = load <4 x float>* %a0 + %y = load <4 x float>* %a1 + %res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %x, < 4 x float > %y, < 4 x float > %a2) + ret < 4 x float > %res +} + +define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) { + ; CHECK: vfmaddpd (%{{.*}}) + %x = load <2 x double>* %a2 + %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x) + ret < 2 x double > %res +} +define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) { + ; CHECK: vfmaddpd %{{.*}}, (%{{.*}}) + %x = load <2 x double>* %a1 + %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2) + ret < 2 x double > %res +} +declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone + +; To test execution dependency +define < 2 x double > @test_x86_fma_vfmadd_pd_load3(< 2 x double >* %a0, < 2 x double >* %a1, < 2 x double > %a2) { + ; CHECK: vmovapd + ; CHECK: vfmaddpd %{{.*}}, (%{{.*}}) + %x = load <2 x double>* %a0 + %y = load <2 x double>* %a1 + %res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %x, < 2 x double > %y, < 2 x double > %a2) + ret < 2 x double > %res +} +
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll index cfb598df634c..9b52db9f14e9 100644 --- a/test/CodeGen/X86/fma_patterns.ll +++ b/test/CodeGen/X86/fma_patterns.ll @@ -184,7 +184,7 @@ define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) { ; CHECK: test_x86_fmadd_ps_load ; CHECK: vmovaps (%rdi), %xmm2 -; CHECK: vfmadd213ps %xmm1, %xmm0, %xmm2 +; CHECK: vfmadd213ps %xmm1, %xmm2, %xmm0 ; CHECK: ret ; CHECK_FMA4: test_x86_fmadd_ps_load ; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0 @@ -198,7 +198,7 @@ define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 ; CHECK: test_x86_fmsub_ps_load ; CHECK: vmovaps (%rdi), %xmm2 -; CHECK: fmsub213ps %xmm1, %xmm0, %xmm2 +; CHECK: fmsub213ps %xmm1, %xmm2, %xmm0 ; CHECK: ret ; CHECK_FMA4: test_x86_fmsub_ps_load ; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
diff --git a/test/CodeGen/X86/fmaxnum.ll b/test/CodeGen/X86/fmaxnum.ll new file mode 100644 index 000000000000..23678c46dba0 --- /dev/null +++ b/test/CodeGen/X86/fmaxnum.ll @@ -0,0 +1,50 @@ +; RUN: llc -march=x86 -mtriple=i386-linux-gnu < %s | FileCheck %s + +declare float @fmaxf(float, float) +declare double @fmax(double, double) +declare x86_fp80 @fmaxl(x86_fp80, x86_fp80) +declare float @llvm.maxnum.f32(float, float) +declare double @llvm.maxnum.f64(double, double) +declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) + +; CHECK-LABEL: @test_fmaxf +; CHECK: calll fmaxf +define float @test_fmaxf(float %x, float %y) { + %z = call float @fmaxf(float %x, float %y) readnone + ret float %z +} + +; CHECK-LABEL: @test_fmax +; CHECK: calll fmax +define double @test_fmax(double %x, double %y) { + %z = call double @fmax(double %x, double %y) readnone + ret double %z +} + +; CHECK-LABEL: @test_fmaxl +; CHECK: calll fmaxl +define x86_fp80 @test_fmaxl(x86_fp80 %x, x86_fp80 %y) { + %z = call x86_fp80 @fmaxl(x86_fp80 %x, x86_fp80 %y) readnone + ret x86_fp80 %z +} + +; CHECK-LABEL: @test_intrinsic_fmaxf +; CHECK: calll fmaxf +define float @test_intrinsic_fmaxf(float %x, float %y) { + %z = call float @llvm.maxnum.f32(float %x, float %y) readnone + ret float %z +} + +; CHECK-LABEL: @test_intrinsic_fmax +; CHECK: calll fmax +define double @test_intrinsic_fmax(double %x, double %y) { + %z = call double @llvm.maxnum.f64(double %x, double %y) readnone + ret double %z +} + +; CHECK-LABEL: @test_intrinsic_fmaxl +; CHECK: calll fmaxl +define x86_fp80 @test_intrinsic_fmaxl(x86_fp80 %x, x86_fp80 %y) { + %z = call x86_fp80 @llvm.maxnum.f80(x86_fp80 %x, x86_fp80 %y) readnone + ret x86_fp80 %z +}
diff --git a/test/CodeGen/X86/fminnum.ll b/test/CodeGen/X86/fminnum.ll new file mode 100644 index 000000000000..1e33cf4696af --- /dev/null +++ b/test/CodeGen/X86/fminnum.ll @@ -0,0 +1,95 @@ +; RUN: llc -march=x86 -mtriple=i386-linux-gnu -mattr=+sse,+sse2 < %s | FileCheck %s + +declare float @fminf(float, float) +declare double @fmin(double, double) +declare x86_fp80 @fminl(x86_fp80, x86_fp80) +declare float @llvm.minnum.f32(float, float) +declare double @llvm.minnum.f64(double, double) +declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) + +declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) +declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) + +; CHECK-LABEL: @test_fminf +; CHECK: jmp fminf
define float @test_fminf(float %x, float %y) { + %z = call float @fminf(float %x, float %y) readnone + ret float %z +} + +; CHECK-LABEL: @test_fmin +; CHECK: jmp fmin +define double @test_fmin(double %x, double %y) { + %z = call double @fmin(double %x, double %y) readnone + ret double %z +} + +; CHECK-LABEL: @test_fminl +; CHECK: calll fminl +define x86_fp80 @test_fminl(x86_fp80 %x, x86_fp80 %y) { + %z = call x86_fp80 @fminl(x86_fp80 %x, x86_fp80 %y) readnone + ret x86_fp80 %z +} + +; CHECK-LABEL: @test_intrinsic_fminf +; CHECK: jmp fminf +define float @test_intrinsic_fminf(float %x, float %y) { + %z = call float @llvm.minnum.f32(float %x, float %y) readnone + ret float %z +} + +; CHECK-LABEL: @test_intrinsic_fmin +; CHECK: jmp fmin +define double @test_intrinsic_fmin(double %x, double %y) { + %z = call double @llvm.minnum.f64(double %x, double %y) readnone + ret double %z +} + +; CHECK-LABEL: @test_intrinsic_fminl +; CHECK: calll fminl +define x86_fp80 @test_intrinsic_fminl(x86_fp80 %x, x86_fp80 %y) { + %z = call x86_fp80 @llvm.minnum.f80(x86_fp80 %x, x86_fp80 %y) readnone + ret x86_fp80 %z +} + +; CHECK-LABEL: @test_intrinsic_fmin_v2f32 +; CHECK: calll fminf +; CHECK: calll fminf +define <2 x float> @test_intrinsic_fmin_v2f32(<2 x float> %x, <2 x float> %y) { + %z = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> %y) readnone + ret <2 x float> %z +} + +; CHECK-LABEL: @test_intrinsic_fmin_v4f32 +; CHECK: calll fminf +; CHECK: calll fminf +; CHECK: calll fminf +; CHECK: calll fminf +define <4 x float> @test_intrinsic_fmin_v4f32(<4 x float> %x, <4 x float> %y) { + %z = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) readnone + ret <4 x float> %z +} + +; CHECK-LABEL: @test_intrinsic_fmin_v2f64 +; CHECK: calll fmin +; CHECK: calll fmin +define <2 x double> @test_intrinsic_fmin_v2f64(<2 x double> %x, <2 x double> %y) { + %z = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> %y) readnone + ret <2 x double> %z +} + +; CHECK-LABEL: @test_intrinsic_fmin_v8f64 +; CHECK: calll fmin +; CHECK: calll fmin +; CHECK: calll fmin +; CHECK: calll fmin +; CHECK: calll fmin +; CHECK: calll fmin +; CHECK: calll fmin +; CHECK: calll fmin +define <8 x double>
@test_intrinsic_fmin_v8f64(<8 x double> %x, <8 x double> %y) { + %z = call <8 x double> @llvm.minnum.v8f64(<8 x double> %x, <8 x double> %y) readnone + ret <8 x double> %z +} diff --git a/test/CodeGen/X86/fmul-combines.ll b/test/CodeGen/X86/fmul-combines.ll new file mode 100644 index 000000000000..703651153c11 --- /dev/null +++ b/test/CodeGen/X86/fmul-combines.ll @@ -0,0 +1,147 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -march=x86-64 < %s | FileCheck %s + +; CHECK-LABEL: fmul2_f32: +; CHECK: addss %xmm0, %xmm0 +define float @fmul2_f32(float %x) { + %y = fmul float %x, 2.0 + ret float %y +} + +; fmul 2.0, x -> fadd x, x for vectors. + +; CHECK-LABEL: fmul2_v4f32: +; CHECK: addps %xmm0, %xmm0 +; CHECK-NEXT: retq +define <4 x float> @fmul2_v4f32(<4 x float> %x) { + %y = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0> + ret <4 x float> %y +} + +; CHECK-LABEL: constant_fold_fmul_v4f32: +; CHECK: movaps +; CHECK-NEXT: ret +define <4 x float> @constant_fold_fmul_v4f32(<4 x float> %x) { + %y = fmul <4 x float> <float 4.0, float 4.0, float 4.0, float 4.0>, <float 2.0, float 2.0, float 2.0, float 2.0> + ret <4 x float> %y +} + +; CHECK-LABEL: fmul0_v4f32: +; CHECK: xorps %xmm0, %xmm0 +; CHECK-NEXT: retq +define <4 x float> @fmul0_v4f32(<4 x float> %x) #0 { + %y = fmul <4 x float> %x, <float 0.0, float 0.0, float 0.0, float 0.0> + ret <4 x float> %y +} + +; CHECK-LABEL: fmul_c2_c4_v4f32: +; CHECK-NOT: addps +; CHECK: mulps +; CHECK-NOT: mulps +; CHECK-NEXT: ret +define <4 x float> @fmul_c2_c4_v4f32(<4 x float> %x) #0 { + %y = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0> + %z = fmul <4 x float> %y, <float 4.0, float 4.0, float 4.0, float 4.0> + ret <4 x float> %z +} + +; CHECK-LABEL: fmul_c3_c4_v4f32: +; CHECK-NOT: addps +; CHECK: mulps +; CHECK-NOT: mulps +; CHECK-NEXT: ret +define <4 x float> @fmul_c3_c4_v4f32(<4 x float> %x) #0 { + %y = fmul <4 x float> %x, <float 3.0, float 3.0, float 3.0, float 3.0> + %z = fmul <4 x float> %y, <float 4.0, float 4.0, float 4.0, float 4.0> + ret <4 x float> %z +} + +; We should be able to pre-multiply the two constant vectors. +; CHECK: float 5.000000e+00 +; CHECK: float 1.200000e+01 +; CHECK: float 2.100000e+01 +; CHECK: float 3.200000e+01 +; CHECK-LABEL: fmul_v4f32_two_consts_no_splat: +; CHECK: mulps +; CHECK-NOT: mulps +; CHECK-NEXT: ret +define <4 x float> @fmul_v4f32_two_consts_no_splat(<4 x float> %x) #0 { + %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0> + %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0> + ret <4 x float> %z +} + +; Same as above, but reverse operands to make sure non-canonical form is also handled. +; CHECK: float 5.000000e+00 +; CHECK: float 1.200000e+01 +; CHECK: float 2.100000e+01 +; CHECK: float 3.200000e+01 +; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_non_canonical: +; CHECK: mulps +; CHECK-NOT: mulps +; CHECK-NEXT: ret +define <4 x float> @fmul_v4f32_two_consts_no_splat_non_canonical(<4 x float> %x) #0 { + %y = fmul <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x + %z = fmul <4 x float> <float 5.0, float 6.0, float 7.0, float 8.0>, %y + ret <4 x float> %z +} + +; More than one use of a constant multiply should not inhibit the optimization. +; Instead of a chain of 2 dependent mults, this test will have 2 independent mults. 
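; NOTE (illustrative, editor's sketch, not part of this commit): the constants
; checked below are the lanewise product of the two multipliers,
; <1,2,3,4> * <5,6,7,8> = <5,12,21,32>, so if the reassociation fires, the
; expected IR is roughly:
;   %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
;   %z = fmul <4 x float> %x, <float 5.0, float 12.0, float 21.0, float 32.0>
;   %a = fadd <4 x float> %y, %z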
+; CHECK: float 5.000000e+00 +; CHECK: float 1.200000e+01 +; CHECK: float 2.100000e+01 +; CHECK: float 3.200000e+01 +; CHECK-LABEL: fmul_v4f32_two_consts_no_splat_multiple_use: +; CHECK: mulps +; CHECK: mulps +; CHECK: addps +; CHECK: ret +define <4 x float> @fmul_v4f32_two_consts_no_splat_multiple_use(<4 x float> %x) #0 { + %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0> + %z = fmul <4 x float> %y, <float 5.0, float 6.0, float 7.0, float 8.0> + %a = fadd <4 x float> %y, %z + ret <4 x float> %a +} + +; CHECK-LABEL: fmul_c2_c4_f32: +; CHECK-NOT: addss +; CHECK: mulss +; CHECK-NOT: mulss +; CHECK-NEXT: ret +define float @fmul_c2_c4_f32(float %x) #0 { + %y = fmul float %x, 2.0 + %z = fmul float %y, 4.0 + ret float %z +} + +; CHECK-LABEL: fmul_c3_c4_f32: +; CHECK-NOT: addss +; CHECK: mulss +; CHECK-NOT: mulss +; CHECK-NEXT: ret +define float @fmul_c3_c4_f32(float %x) #0 { + %y = fmul float %x, 3.0 + %z = fmul float %y, 4.0 + ret float %z +} + +; CHECK-LABEL: fmul_fneg_fneg_f32: +; CHECK: mulss %xmm1, %xmm0 +; CHECK-NEXT: retq +define float @fmul_fneg_fneg_f32(float %x, float %y) { + %x.neg = fsub float -0.0, %x + %y.neg = fsub float -0.0, %y + %mul = fmul float %x.neg, %y.neg + ret float %mul +} +; CHECK-LABEL: fmul_fneg_fneg_v4f32: +; CHECK: mulps {{%xmm1|\(%rdx\)}}, %xmm0 +; CHECK-NEXT: retq +define <4 x float> @fmul_fneg_fneg_v4f32(<4 x float> %x, <4 x float> %y) { + %x.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %x + %y.neg = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %y + %mul = fmul <4 x float> %x.neg, %y.neg + ret <4 x float> %mul +} + +attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } diff --git a/test/CodeGen/X86/fnabs.ll b/test/CodeGen/X86/fnabs.ll new file mode 100644 index 000000000000..19718d3ff926 --- /dev/null +++ b/test/CodeGen/X86/fnabs.ll @@ -0,0 +1,77 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s + +; Verify that we generate a single OR instruction for scalar, vec128, and vec256 +; FNABS(x) operations -> FNEG (FABS(x)). +; If the FABS() result isn't used, the AND instruction should be eliminated.
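; NOTE (illustrative, editor's sketch, not part of this commit): FABS clears the
; sign bit (AND with 0x7fffffff per lane) and FNEG sets it (XOR with 0x80000000),
; so FNEG(FABS(x)) folds to a single OR with the sign-bit mask, roughly:
;   %i = bitcast float %a to i32
;   %n = or i32 %i, -2147483648   ; -2147483648 == 0x80000000, the sign bit
;   %r = bitcast i32 %n to float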
+; PR20578: http://llvm.org/bugs/show_bug.cgi?id=20578 + +define float @scalar_no_abs(float %a) { +; CHECK-LABEL: scalar_no_abs: +; CHECK: vorps +; CHECK-NEXT: retq + %fabs = tail call float @fabsf(float %a) #1 + %fsub = fsub float -0.0, %fabs + ret float %fsub +} + +define float @scalar_uses_abs(float %a) { +; CHECK-LABEL: scalar_uses_abs: +; CHECK-DAG: vandps +; CHECK-DAG: vorps +; CHECK: vmulss +; CHECK-NEXT: retq + %fabs = tail call float @fabsf(float %a) #1 + %fsub = fsub float -0.0, %fabs + %fmul = fmul float %fsub, %fabs + ret float %fmul +} + +define <4 x float> @vector128_no_abs(<4 x float> %a) { +; CHECK-LABEL: vector128_no_abs: +; CHECK: vorps +; CHECK-NEXT: retq + %fabs = tail call <4 x float> @llvm.fabs.v4f32(< 4 x float> %a) #1 + %fsub = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %fabs + ret <4 x float> %fsub +} + +define <4 x float> @vector128_uses_abs(<4 x float> %a) { +; CHECK-LABEL: vector128_uses_abs: +; CHECK-DAG: vandps +; CHECK-DAG: vorps +; CHECK: vmulps +; CHECK-NEXT: retq + %fabs = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #1 + %fsub = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %fabs + %fmul = fmul <4 x float> %fsub, %fabs + ret <4 x float> %fmul +} + +define <8 x float> @vector256_no_abs(<8 x float> %a) { +; CHECK-LABEL: vector256_no_abs: +; CHECK: vorps +; CHECK-NEXT: retq + %fabs = tail call <8 x float> @llvm.fabs.v8f32(< 8 x float> %a) #1 + %fsub = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %fabs + ret <8 x float> %fsub +} + +define <8 x float> @vector256_uses_abs(<8 x float> %a) { +; CHECK-LABEL: vector256_uses_abs: +; CHECK-DAG: vandps +; CHECK-DAG: vorps +; CHECK: vmulps +; CHECK-NEXT: retq + %fabs = tail call <8 x float> @llvm.fabs.v8f32(<8 x float> %a) #1 + %fsub = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %fabs + %fmul = fmul <8 x float> %fsub, %fabs + ret <8 x float> %fmul +} + +declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p) +declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p) + +declare float @fabsf(float) + +attributes #1 = { readnone } + diff --git a/test/CodeGen/X86/fold-pcmpeqd-0.ll b/test/CodeGen/X86/fold-pcmpeqd-0.ll deleted file mode 100644 index 1d315ffe359b..000000000000 --- a/test/CodeGen/X86/fold-pcmpeqd-0.ll +++ /dev/null @@ -1,117 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=X86-64 %s -; DISABLED: llc < %s -mtriple=i386-apple-darwin -mcpu=yonah -regalloc=linearscan | FileCheck --check-prefix=I386 %s - -; i386 test has been disabled when scheduler 2-addr hack is disabled. - -; This testcase shouldn't need to spill the -1 value, -; so it should just use pcmpeqd to materialize an all-ones vector. -; For i386, cp load of -1 are folded. - -; With -regalloc=greedy, the live range is split before spilling, so the first -; pcmpeq doesn't get folded as a constant pool load. 
- -; I386-NOT: pcmpeqd -; I386: orps LCPI0_2, %xmm -; I386-NOT: pcmpeqd -; I386: orps LCPI0_2, %xmm - -; X86-64: pcmpeqd -; X86-64-NOT: pcmpeqd - - %struct.__ImageExecInfo = type <{ <4 x i32>, <4 x float>, <2 x i64>, i8*, i8*, i8*, i32, i32, i32, i32, i32 }> - %struct._cl_image_format_t = type <{ i32, i32, i32 }> - %struct._image2d_t = type <{ i8*, %struct._cl_image_format_t, i32, i32, i32, i32, i32, i32 }> - -define void @program_1(%struct._image2d_t* %dest, %struct._image2d_t* %t0, <4 x float> %p0, <4 x float> %p1, <4 x float> %p4, <4 x float> %p5, <4 x float> %p6) nounwind { -entry: - %tmp3.i = load i32* null ; <i32> [#uses=1] - %cmp = icmp sgt i32 %tmp3.i, 200 ; <i1> [#uses=1] - br i1 %cmp, label %forcond, label %ifthen - -ifthen: ; preds = %entry - ret void - -forcond: ; preds = %entry - %tmp3.i536 = load i32* null ; <i32> [#uses=1] - %cmp12 = icmp slt i32 0, %tmp3.i536 ; <i1> [#uses=1] - br i1 %cmp12, label %forbody, label %afterfor - -forbody: ; preds = %forcond - %bitcast204.i313 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1] - %mul233 = fmul <4 x float> %bitcast204.i313, zeroinitializer ; <<4 x float>> [#uses=1] - %mul257 = fmul <4 x float> %mul233, zeroinitializer ; <<4 x float>> [#uses=1] - %mul275 = fmul <4 x float> %mul257, zeroinitializer ; <<4 x float>> [#uses=1] - %tmp51 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %mul275, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1] - %bitcast198.i182 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0] - %bitcast204.i185 = bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>> [#uses=1] - %tmp69 = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> zeroinitializer) nounwind ; <<4 x i32>> [#uses=1] - %tmp70 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %tmp69) nounwind ; <<4 x float>> [#uses=1] - %sub140.i78 = fsub <4 x float> zeroinitializer, %tmp70 ; <<4 x float>> [#uses=2] - %mul166.i86 = fmul <4 x float> zeroinitializer, %sub140.i78 ; <<4 x float>> [#uses=1] - %add167.i87 = fadd <4 x float> %mul166.i86, < float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000, float 0x3FE62ACB60000000 > ; <<4 x float>> [#uses=1] - %mul171.i88 = fmul <4 x float> %add167.i87, %sub140.i78 ; <<4 x float>> [#uses=1] - %add172.i89 = fadd <4 x float> %mul171.i88, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1] - %bitcast176.i90 = bitcast <4 x float> %add172.i89 to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps178.i92 = and <4 x i32> %bitcast176.i90, zeroinitializer ; <<4 x i32>> [#uses=1] - %bitcast179.i93 = bitcast <4 x i32> %andnps178.i92 to <4 x float> ; <<4 x float>> [#uses=1] - %mul186.i96 = fmul <4 x float> %bitcast179.i93, zeroinitializer ; <<4 x float>> [#uses=1] - %bitcast190.i98 = bitcast <4 x float> %mul186.i96 to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps192.i100 = and <4 x i32> %bitcast190.i98, zeroinitializer ; <<4 x i32>> [#uses=1] - %xorps.i102 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1] - %orps203.i103 = or <4 x i32> %andnps192.i100, %xorps.i102 ; <<4 x i32>> [#uses=1] - %bitcast204.i104 = bitcast <4 x i32> %orps203.i103 to <4 x float> ; <<4 x float>> [#uses=1] - %cmple.i = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> %tmp51, i8 2) nounwind ; <<4 x float>> [#uses=1] - %tmp80 = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> zeroinitializer) nounwind ; <<4 x 
float>> [#uses=1] - %sub140.i = fsub <4 x float> zeroinitializer, %tmp80 ; <<4 x float>> [#uses=1] - %bitcast148.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps150.i = and <4 x i32> %bitcast148.i, < i32 -2139095041, i32 -2139095041, i32 -2139095041, i32 -2139095041 > ; <<4 x i32>> [#uses=0] - %mul171.i = fmul <4 x float> zeroinitializer, %sub140.i ; <<4 x float>> [#uses=1] - %add172.i = fadd <4 x float> %mul171.i, < float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000, float 0x3FF0000A40000000 > ; <<4 x float>> [#uses=1] - %bitcast176.i = bitcast <4 x float> %add172.i to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps178.i = and <4 x i32> %bitcast176.i, zeroinitializer ; <<4 x i32>> [#uses=1] - %bitcast179.i = bitcast <4 x i32> %andnps178.i to <4 x float> ; <<4 x float>> [#uses=1] - %mul186.i = fmul <4 x float> %bitcast179.i, zeroinitializer ; <<4 x float>> [#uses=1] - %bitcast189.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=0] - %bitcast190.i = bitcast <4 x float> %mul186.i to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps192.i = and <4 x i32> %bitcast190.i, zeroinitializer ; <<4 x i32>> [#uses=1] - %bitcast198.i = bitcast <4 x float> %cmple.i to <4 x i32> ; <<4 x i32>> [#uses=1] - %xorps.i = xor <4 x i32> %bitcast198.i, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1] - %orps203.i = or <4 x i32> %andnps192.i, %xorps.i ; <<4 x i32>> [#uses=1] - %bitcast204.i = bitcast <4 x i32> %orps203.i to <4 x float> ; <<4 x float>> [#uses=1] - %mul307 = fmul <4 x float> %bitcast204.i185, zeroinitializer ; <<4 x float>> [#uses=1] - %mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer ; <<4 x float>> [#uses=2] - %mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer ; <<4 x float>> [#uses=1] - %tmp82 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul307, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1] - %bitcast11.i15 = bitcast <4 x float> %tmp82 to <4 x i32> ; <<4 x i32>> [#uses=1] - %andnps.i17 = and <4 x i32> %bitcast11.i15, zeroinitializer ; <<4 x i32>> [#uses=1] - %orps.i18 = or <4 x i32> %andnps.i17, zeroinitializer ; <<4 x i32>> [#uses=1] - %bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float> ; <<4 x float>> [#uses=1] - %tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1] - %bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32> ; <<4 x i32>> [#uses=1] - %bitcast6.i4 = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=2] - %andps.i5 = and <4 x i32> %bitcast.i3, %bitcast6.i4 ; <<4 x i32>> [#uses=1] - %bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32> ; <<4 x i32>> [#uses=1] - %not.i7 = xor <4 x i32> %bitcast6.i4, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1] - %andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7 ; <<4 x i32>> [#uses=1] - %orps.i9 = or <4 x i32> %andnps.i8, %andps.i5 ; <<4 x i32>> [#uses=1] - %bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float> ; <<4 x float>> [#uses=1] - %bitcast.i = bitcast <4 x float> %mul313 to <4 x i32> ; <<4 x i32>> [#uses=1] - %andps.i = and <4 x i32> %bitcast.i, zeroinitializer ; <<4 x i32>> [#uses=1] - %orps.i = or <4 x i32> zeroinitializer, %andps.i ; <<4 x i32>> [#uses=1] - %bitcast17.i = bitcast <4 x i32> %orps.i to <4 x float> ; <<4 x float>> [#uses=1] - call void null(<4 x float> %bitcast17.i19, <4 x float> %bitcast17.i10, <4 x float> %bitcast17.i, <4 x float> zeroinitializer, %struct.__ImageExecInfo* 
null, <4 x i32> zeroinitializer) nounwind - unreachable - -afterfor: ; preds = %forcond - ret void -} - -declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone - -declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone - -declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone - -declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone - -declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll new file mode 100644 index 000000000000..5bf5dbd1a9ce --- /dev/null +++ b/test/CodeGen/X86/fold-tied-op.ll @@ -0,0 +1,84 @@ +; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s +; Regression test for http://reviews.llvm.org/D5701 + +; ModuleID = 'xxhash.i' +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386--netbsd" + +; CHECK-LABEL: fn1 +; CHECK: shldl {{.*#+}} 4-byte Folded Spill +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: shldl {{.*#+}} 4-byte Folded Spill +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: addl {{.*#+}} 4-byte Folded Reload +; CHECK: imull {{.*#+}} 4-byte Folded Reload +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: retl + +%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 } + +@a = common global i32 0, align 4 +@b = common global i64 0, align 8 + +; Function Attrs: nounwind uwtable +define i64 @fn1() #0 { +entry: + %0 = load i32* @a, align 4, !tbaa !1 + %1 = inttoptr i32 %0 to %struct.XXH_state64_t* + %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0 + %2 = load i32* %total_len, align 4, !tbaa !5 + %tobool = icmp eq i32 %2, 0 + br i1 %tobool, label %if.else, label %if.then + +if.then: ; preds = %entry + %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3 + %3 = load i64* %v3, align 4, !tbaa !8 + %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4 + %4 = load i64* %v4, align 4, !tbaa !9 + %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2 + %5 = load i64* %v2, align 4, !tbaa !10 + %shl = shl i64 %5, 1 + %or = or i64 %shl, %5 + %shl2 = shl i64 %3, 2 + %shr = lshr i64 %3, 1 + %or3 = or i64 %shl2, %shr + %add = add i64 %or, %or3 + %mul = mul i64 %4, -4417276706812531889 + %shl4 = mul i64 %4, -8834553413625063778 + %shr5 = ashr i64 %mul, 3 + %or6 = or i64 %shr5, %shl4 + %mul7 = mul nsw i64 %or6, 1400714785074694791 + %xor = xor i64 %add, %mul7 + store i64 %xor, i64* @b, align 8, !tbaa !11 + %mul8 = mul nsw i64 %xor, 1400714785074694791 + br label %if.end + +if.else: ; preds = %entry + %6 = load i64* @b, align 8, !tbaa !11 + %xor10 = xor i64 %6, -4417276706812531889 + %mul11 = mul nsw i64 %xor10, 400714785074694791 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ] + %storemerge = add i64 %storemerge.in, -8796714831421723037 + store i64 %storemerge, i64* @b, align 8, !tbaa !11 + ret i64 undef +} + +attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.6 (trunk 219587)"} +!1 = !{!2, !2, i64 0} +!2 = !{!"int", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} 
+!5 = !{!6, !2, i64 0} +!6 = !{!"XXH_state64_t", !2, i64 0, !2, i64 4, !7, i64 8, !7, i64 16, !7, i64 24} +!7 = !{!"long long", !3, i64 0} +!8 = !{!6, !7, i64 16} +!9 = !{!6, !7, i64 24} +!10 = !{!6, !7, i64 8} +!11 = !{!7, !7, i64 0} diff --git a/test/CodeGen/X86/force-align-stack-alloca.ll b/test/CodeGen/X86/force-align-stack-alloca.ll index 95defc83db1f..bd9806943920 100644 --- a/test/CodeGen/X86/force-align-stack-alloca.ll +++ b/test/CodeGen/X86/force-align-stack-alloca.ll @@ -33,14 +33,14 @@ define i64 @g(i32 %i) nounwind { ; CHECK-NOT: {{[^ ,]*}}, %esp ; ; Next we set up the memset call, and then undo it. -; CHECK: subl $32, %esp +; CHECK: subl $20, %esp ; CHECK-NOT: {{[^ ,]*}}, %esp ; CHECK: calll memset ; CHECK-NEXT: addl $32, %esp ; CHECK-NOT: {{[^ ,]*}}, %esp ; ; Next we set up the call to 'f'. -; CHECK: subl $32, %esp +; CHECK: subl $28, %esp ; CHECK-NOT: {{[^ ,]*}}, %esp ; CHECK: calll f ; CHECK-NEXT: addl $32, %esp diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll index a973befdafe7..e6c1e1adb59e 100644 --- a/test/CodeGen/X86/fp-load-trunc.ll +++ b/test/CodeGen/X86/fp-load-trunc.ll @@ -2,57 +2,87 @@ ; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX define <1 x float> @test1(<1 x double>* %p) nounwind { -; CHECK: test1 -; CHECK: cvtsd2ss -; CHECK: ret -; AVX: test1 -; AVX: vcvtsd2ss -; AVX: ret +; CHECK-LABEL: test1: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movsd (%eax), %xmm0 +; CHECK-NEXT: cvtsd2ss %xmm0, %xmm0 +; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl +; +; AVX-LABEL: test1: +; AVX: # BB#0: +; AVX-NEXT: pushl %eax +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-NEXT: vmovsd (%eax), %xmm0 +; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovss %xmm0, (%esp) +; AVX-NEXT: flds (%esp) +; AVX-NEXT: popl %eax +; AVX-NEXT: retl %x = load <1 x double>* %p %y = fptrunc <1 x double> %x to <1 x float> ret <1 x float> %y } define <2 x float> @test2(<2 x double>* %p) nounwind { -; CHECK: test2 -; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) -; CHECK: ret -; AVX: test2 -; AVX: vcvtpd2psx {{[0-9]*}}(%{{.*}}) -; AVX: ret +; CHECK-LABEL: test2: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: cvtpd2ps (%eax), %xmm0 +; CHECK-NEXT: retl +; +; AVX-LABEL: test2: +; AVX: # BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-NEXT: vcvtpd2psx (%eax), %xmm0 +; AVX-NEXT: retl %x = load <2 x double>* %p %y = fptrunc <2 x double> %x to <2 x float> ret <2 x float> %y } define <4 x float> @test3(<4 x double>* %p) nounwind { -; CHECK: test3 -; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) -; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) -; CHECK: movlhps -; CHECK: ret -; AVX: test3 -; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}}) -; AVX: ret +; CHECK-LABEL: test3: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: cvtpd2ps 16(%eax), %xmm1 +; CHECK-NEXT: cvtpd2ps (%eax), %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retl +; +; AVX-LABEL: test3: +; AVX: # BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-NEXT: vcvtpd2psy (%eax), %xmm0 +; AVX-NEXT: retl %x = load <4 x double>* %p %y = fptrunc <4 x double> %x to <4 x float> ret <4 x float> %y } define <8 x float> @test4(<8 x double>* %p) nounwind { -; CHECK: test4 -; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) -; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) -; CHECK: movlhps -; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}}) -; CHECK: cvtpd2ps 
{{[0-9]*}}(%{{.*}}) -; CHECK: movlhps -; CHECK: ret -; AVX: test4 -; AVX: vcvtpd2psy -; AVX: vcvtpd2psy -; AVX: vinsertf128 -; AVX: ret +; CHECK-LABEL: test4: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: cvtpd2ps 16(%eax), %xmm1 +; CHECK-NEXT: cvtpd2ps (%eax), %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: cvtpd2ps 48(%eax), %xmm2 +; CHECK-NEXT: cvtpd2ps 32(%eax), %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; CHECK-NEXT: retl +; +; AVX-LABEL: test4: +; AVX: # BB#0: +; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-NEXT: vcvtpd2psy (%eax), %xmm0 +; AVX-NEXT: vcvtpd2psy 32(%eax), %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retl %x = load <8 x double>* %p %y = fptrunc <8 x double> %x to <8 x float> ret <8 x float> %y diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll index 25442fcadd23..6424bfc9c219 100644 --- a/test/CodeGen/X86/fp-trunc.ll +++ b/test/CodeGen/X86/fp-trunc.ll @@ -2,55 +2,77 @@ ; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX define <1 x float> @test1(<1 x double> %x) nounwind { -; CHECK: test1 -; CHECK: cvtsd2ss -; CHECK: ret -; AVX: test1 -; AVX: vcvtsd2ss -; AVX: ret +; CHECK-LABEL: test1: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movsd {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: cvtsd2ss %xmm0, %xmm0 +; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl +; +; AVX-LABEL: test1: +; AVX: # BB#0: +; AVX-NEXT: pushl %eax +; AVX-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 +; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovss %xmm0, (%esp) +; AVX-NEXT: flds (%esp) +; AVX-NEXT: popl %eax +; AVX-NEXT: retl %y = fptrunc <1 x double> %x to <1 x float> ret <1 x float> %y } define <2 x float> @test2(<2 x double> %x) nounwind { -; CHECK: test2 -; CHECK: cvtpd2ps -; CHECK: ret -; AVX: test2 -; AVX-NOT: vcvtpd2psy -; AVX: vcvtpd2ps -; AVX: ret +; CHECK-LABEL: test2: +; CHECK: # BB#0: +; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 +; CHECK-NEXT: retl +; +; AVX-LABEL: test2: +; AVX: # BB#0: +; AVX-NEXT: vcvtpd2ps %xmm0, %xmm0 +; AVX-NEXT: retl %y = fptrunc <2 x double> %x to <2 x float> ret <2 x float> %y } define <4 x float> @test3(<4 x double> %x) nounwind { -; CHECK: test3 -; CHECK: cvtpd2ps -; CHECK: cvtpd2ps -; CHECK: movlhps -; CHECK: ret -; AVX: test3 -; AVX: vcvtpd2psy -; AVX: ret +; CHECK-LABEL: test3: +; CHECK: # BB#0: +; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1 +; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retl +; +; AVX-LABEL: test3: +; AVX: # BB#0: +; AVX-NEXT: vcvtpd2psy %ymm0, %xmm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: retl %y = fptrunc <4 x double> %x to <4 x float> ret <4 x float> %y } define <8 x float> @test4(<8 x double> %x) nounwind { -; CHECK: test4 -; CHECK: cvtpd2ps -; CHECK: cvtpd2ps -; CHECK: movlhps -; CHECK: cvtpd2ps -; CHECK: cvtpd2ps -; CHECK: movlhps -; CHECK: ret -; AVX: test4 -; AVX: vcvtpd2psy -; AVX: vcvtpd2psy -; AVX: vinsertf128 -; AVX: ret +; CHECK-LABEL: test4: +; CHECK: # BB#0: +; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1 +; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: cvtpd2ps %xmm3, %xmm3 +; CHECK-NEXT: cvtpd2ps %xmm2, %xmm1 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; CHECK-NEXT: retl +; +; AVX-LABEL: test4: +; AVX: # BB#0: +; AVX-NEXT: vcvtpd2psy %ymm0, %xmm0 +; AVX-NEXT: vcvtpd2psy %ymm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, 
%xmm1, %ymm0, %ymm0 +; AVX-NEXT: retl %y = fptrunc <8 x double> %x to <8 x float> ret <8 x float> %y } diff --git a/test/CodeGen/X86/fpstack-debuginstr-kill.ll b/test/CodeGen/X86/fpstack-debuginstr-kill.ll new file mode 100644 index 000000000000..e3180f4e68a2 --- /dev/null +++ b/test/CodeGen/X86/fpstack-debuginstr-kill.ll @@ -0,0 +1,71 @@ +; RUN: llc < %s -mcpu=generic -mtriple=i386-apple-darwin -no-integrated-as + +@g1 = global double 0.000000e+00, align 8 +@g2 = global i32 0, align 4 + +define void @_Z16fpuop_arithmeticjj(i32, i32) { +entry: + switch i32 undef, label %sw.bb.i1921 [ + ] + +sw.bb261: ; preds = %entry, %entry + unreachable + +sw.bb.i1921: ; preds = %if.end504 + switch i32 undef, label %if.end511 [ + i32 1, label %sw.bb27.i + ] + +sw.bb27.i: ; preds = %sw.bb.i1921 + %conv.i.i1923 = fpext float undef to x86_fp80 + br label %if.end511 + +if.end511: ; preds = %sw.bb27.i, %sw.bb13.i + %src.sroa.0.0.src.sroa.0.0.2280 = phi x86_fp80 [ %conv.i.i1923, %sw.bb27.i ], [ undef, %sw.bb.i1921 ] + switch i32 undef, label %sw.bb992 [ + i32 3, label %sw.bb735 + i32 18, label %if.end41.i2210 + ] + +sw.bb735: ; preds = %if.end511 + %2 = call x86_fp80 asm sideeffect "frndint", "={st},0,~{dirflag},~{fpsr},~{flags}"(x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280) + unreachable + +if.end41.i2210: ; preds = %if.end511 + call void @llvm.dbg.value(metadata x86_fp80 %src.sroa.0.0.src.sroa.0.0.2280, i64 0, metadata !20, metadata !{!"0x102"}) + unreachable + +sw.bb992: ; preds = %if.end511 + ret void +} + +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!24, !25} +!0 = !{!"0x11\004\00clang version 3.6.0 (http://llvm.org/git/clang 8444ae7cfeaefae031f8fedf0d1435ca3b14d90b) (http://llvm.org/git/llvm 886f0101a7d176543b831f5efb74c03427244a55)\001\00\000\00\001", !1, !2, !2, !3, !21, !2} ; [ DW_TAG_compile_unit ] [x87stackifier/fpu_ieee.cpp] [DW_LANG_C_plus_plus] +!1 = !{!"fpu_ieee.cpp", !"x87stackifier"} +!2 = !{} +!3 = !{!4} +!4 = !{!"0x2e\00fpuop_arithmetic\00fpuop_arithmetic\00_Z16fpuop_arithmeticjj\0011\000\001\000\006\00256\001\0013", !5, !6, !7, null, void (i32, i32)* @_Z16fpuop_arithmeticjj, null, null, !10} ; [ DW_TAG_subprogram ] [line 11] [def] [scope 13] [fpuop_arithmetic] +!5 = !{!"f1.cpp", !"x87stackifier"} +!6 = !{!"0x29", !5} ; [ DW_TAG_file_type ] [x87stackifier/f1.cpp] +!7 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !8, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = !{null, !9, !9} +!9 = !{!"0x24\00unsigned int\000\0032\0032\000\000\007", null, null} ; [ DW_TAG_base_type ] [unsigned int] [line 0, size 32, align 32, offset 0, enc DW_ATE_unsigned] +!10 = !{!11, !12, !13, !18, !20} +!11 = !{!"0x101\00\0016777227\000", !4, !6, !9} ; [ DW_TAG_arg_variable ] [line 11] +!12 = !{!"0x101\00\0033554443\000", !4, !6, !9} ; [ DW_TAG_arg_variable ] [line 11] +!13 = !{!"0x100\00x\0014\000", !4, !6, !14} ; [ DW_TAG_auto_variable ] [x] [line 14] +!14 = !{!"0x16\00fpu_extended\003\000\000\000\000", !5, null, !15} ; [ DW_TAG_typedef ] [fpu_extended] [line 3, size 0, align 0, offset 0] [from fpu_register] +!15 = !{!"0x16\00fpu_register\002\000\000\000\000", !5, null, !16} ; [ DW_TAG_typedef ] [fpu_register] [line 2, size 0, align 0, offset 0] [from uae_f64] +!16 = !{!"0x16\00uae_f64\001\000\000\000\000", !5, null, !17} ; [ DW_TAG_typedef ] [uae_f64] [line 1, size 0, align 0, offset 0] [from double] +!17 = !{!"0x24\00double\000\0064\0064\000\000\004", null, null} ; [ DW_TAG_base_type ] 
[double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float] +!18 = !{!"0x100\00a\0015\000", !4, !6, !19} ; [ DW_TAG_auto_variable ] [a] [line 15] +!19 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!20 = !{!"0x100\00value\0016\000", !4, !6, !14} ; [ DW_TAG_auto_variable ] [value] [line 16] +!21 = !{!22, !23} +!22 = !{!"0x34\00g1\00g1\00\005\000\001", null, !6, !14, double* @g1, null} ; [ DW_TAG_variable ] [g1] [line 5] [def] +!23 = !{!"0x34\00g2\00g2\00\006\000\001", null, !6, !19, i32* @g2, null} ; [ DW_TAG_variable ] [g2] [line 6] [def] +!24 = !{i32 2, !"Dwarf Version", i32 2} +!25 = !{i32 2, !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/frameaddr.ll b/test/CodeGen/X86/frameaddr.ll index 6c1ca252bb97..3e0f8bc34d64 100644 --- a/test/CodeGen/X86/frameaddr.ll +++ b/test/CodeGen/X86/frameaddr.ll @@ -2,6 +2,10 @@ ; RUN: llc < %s -march=x86 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-32 ; RUN: llc < %s -march=x86-64 | FileCheck %s --check-prefix=CHECK-64 ; RUN: llc < %s -march=x86-64 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-64 +; RUN: llc < %s -mtriple=x86_64-gnux32 | FileCheck %s --check-prefix=CHECK-X32ABI +; RUN: llc < %s -mtriple=x86_64-gnux32 -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-X32ABI +; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s --check-prefix=CHECK-NACL64 +; RUN: llc < %s -mtriple=x86_64-nacl -fast-isel -fast-isel-abort | FileCheck %s --check-prefix=CHECK-NACL64 define i8* @test1() nounwind { entry: @@ -17,6 +21,16 @@ entry: ; CHECK-64-NEXT: movq %rbp, %rax ; CHECK-64-NEXT: pop ; CHECK-64-NEXT: ret +; CHECK-X32ABI-LABEL: test1 +; CHECK-X32ABI: pushq %rbp +; CHECK-X32ABI-NEXT: movl %esp, %ebp +; CHECK-X32ABI-NEXT: movl %ebp, %eax +; CHECK-X32ABI-NEXT: popq %rbp +; CHECK-X32ABI-NEXT: ret +; CHECK-NACL64-LABEL: test1 +; CHECK-NACL64: pushq %rbp +; CHECK-NACL64-NEXT: movq %rsp, %rbp +; CHECK-NACL64-NEXT: movl %ebp, %eax %0 = tail call i8* @llvm.frameaddress(i32 0) ret i8* %0 } @@ -37,6 +51,18 @@ entry: ; CHECK-64-NEXT: movq (%rax), %rax ; CHECK-64-NEXT: pop ; CHECK-64-NEXT: ret +; CHECK-X32ABI-LABEL: test2 +; CHECK-X32ABI: pushq %rbp +; CHECK-X32ABI-NEXT: movl %esp, %ebp +; CHECK-X32ABI-NEXT: movl (%ebp), %eax +; CHECK-X32ABI-NEXT: movl (%eax), %eax +; CHECK-X32ABI-NEXT: popq %rbp +; CHECK-X32ABI-NEXT: ret +; CHECK-NACL64-LABEL: test2 +; CHECK-NACL64: pushq %rbp +; CHECK-NACL64-NEXT: movq %rsp, %rbp +; CHECK-NACL64-NEXT: movl (%ebp), %eax +; CHECK-NACL64-NEXT: movl (%eax), %eax %0 = tail call i8* @llvm.frameaddress(i32 2) ret i8* %0 } diff --git a/test/CodeGen/X86/frameallocate.ll b/test/CodeGen/X86/frameallocate.ll new file mode 100644 index 000000000000..13d35b91937d --- /dev/null +++ b/test/CodeGen/X86/frameallocate.ll @@ -0,0 +1,39 @@ +; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s + +declare i8* @llvm.frameallocate(i32) +declare i8* @llvm.frameaddress(i32) +declare i8* @llvm.framerecover(i8*, i8*) +declare i32 @printf(i8*, ...) 
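; NOTE (editor's sketch, not part of this commit): as exercised below,
; llvm.frameallocate reserves a fixed-size block at a known offset in the
; caller's frame, and llvm.framerecover lets a callee that is handed the
; caller's frame pointer recompute that block's address, roughly:
;   %buf = call i8* @llvm.frameallocate(i32 16)            ; in alloc_func
;   %fp  = call i8* @llvm.frameaddress(i32 0)              ; passed to the child
;   %p   = call i8* @llvm.framerecover(i8* <fn>, i8* %fp)  ; fp + fixed offset
; where <fn> stands for the address of the allocating function.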
+ +@str = internal constant [10 x i8] c"asdf: %d\0A\00" + +define void @print_framealloc_from_fp(i8* %fp) { + %alloc = call i8* @llvm.framerecover(i8* bitcast (void(i32*, i32*)* @alloc_func to i8*), i8* %fp) + %alloc_i32 = bitcast i8* %alloc to i32* + %r = load i32* %alloc_i32 + call i32 (i8*, ...)* @printf(i8* getelementptr ([10 x i8]* @str, i32 0, i32 0), i32 %r) + ret void +} + +; CHECK-LABEL: print_framealloc_from_fp: +; CHECK: movabsq $.Lframeallocation_alloc_func, %[[offs:[a-z]+]] +; CHECK: movl (%rcx,%[[offs]]), %edx +; CHECK: leaq {{.*}}(%rip), %rcx +; CHECK: callq printf +; CHECK: retq + +define void @alloc_func(i32* %s, i32* %d) { + %alloc = call i8* @llvm.frameallocate(i32 16) + %alloc_i32 = bitcast i8* %alloc to i32* + store i32 42, i32* %alloc_i32 + %fp = call i8* @llvm.frameaddress(i32 0) + call void @print_framealloc_from_fp(i8* %fp) + ret void +} + +; CHECK-LABEL: alloc_func: +; CHECK: .Lframeallocation_alloc_func = -[[offs:[0-9]+]] +; CHECK: movl $42, -[[offs]](%rbp) +; CHECK: movq %rbp, %rcx +; CHECK: callq print_framealloc_from_fp +; CHECK: retq diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll index 5f48b1e32b16..6d397b211481 100644 --- a/test/CodeGen/X86/gather-addresses.ll +++ b/test/CodeGen/X86/gather-addresses.ll @@ -1,35 +1,38 @@ ; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN ; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=WIN +; RUN: llc -mtriple=i686-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN32 ; rdar://7398554 ; When doing vector gather-scatter index calculation with 32-bit indices, -; bounce the vector off of cache rather than shuffling each individual +; use an efficient mov/shift sequence rather than shuffling each individual ; element out of the index vector. 
-; CHECK: foo: -; LIN: movaps (%rsi), %xmm0 -; LIN: andps (%rdx), %xmm0 -; LIN: movaps %xmm0, -24(%rsp) -; LIN: movslq -24(%rsp), %[[REG1:r.+]] -; LIN: movslq -20(%rsp), %[[REG2:r.+]] -; LIN: movslq -16(%rsp), %[[REG3:r.+]] -; LIN: movslq -12(%rsp), %[[REG4:r.+]] -; LIN: movsd (%rdi,%[[REG1]],8), %xmm0 -; LIN: movhpd (%rdi,%[[REG2]],8), %xmm0 -; LIN: movsd (%rdi,%[[REG3]],8), %xmm1 -; LIN: movhpd (%rdi,%[[REG4]],8), %xmm1 +; CHECK-LABEL: foo: +; LIN: movdqa (%rsi), %xmm0 +; LIN: pand (%rdx), %xmm0 +; LIN: pextrq $1, %xmm0, %r[[REG4:.+]] +; LIN: movd %xmm0, %r[[REG2:.+]] +; LIN: movslq %e[[REG2]], %r[[REG1:.+]] +; LIN: sarq $32, %r[[REG2]] +; LIN: movslq %e[[REG4]], %r[[REG3:.+]] +; LIN: sarq $32, %r[[REG4]] +; LIN: movsd (%rdi,%r[[REG1]],8), %xmm0 +; LIN: movhpd (%rdi,%r[[REG2]],8), %xmm0 +; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1 +; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1 -; WIN: movaps (%rdx), %xmm0 -; WIN: andps (%r8), %xmm0 -; WIN: movaps %xmm0, (%rsp) -; WIN: movslq (%rsp), %[[REG1:r.+]] -; WIN: movslq 4(%rsp), %[[REG2:r.+]] -; WIN: movslq 8(%rsp), %[[REG3:r.+]] -; WIN: movslq 12(%rsp), %[[REG4:r.+]] -; WIN: movsd (%rcx,%[[REG1]],8), %xmm0 -; WIN: movhpd (%rcx,%[[REG2]],8), %xmm0 -; WIN: movsd (%rcx,%[[REG3]],8), %xmm1 -; WIN: movhpd (%rcx,%[[REG4]],8), %xmm1 +; WIN: movdqa (%rdx), %xmm0 +; WIN: pand (%r8), %xmm0 +; WIN: pextrq $1, %xmm0, %r[[REG4:.+]] +; WIN: movd %xmm0, %r[[REG2:.+]] +; WIN: movslq %e[[REG2]], %r[[REG1:.+]] +; WIN: sarq $32, %r[[REG2]] +; WIN: movslq %e[[REG4]], %r[[REG3:.+]] +; WIN: sarq $32, %r[[REG4]] +; WIN: movsd (%rcx,%r[[REG1]],8), %xmm0 +; WIN: movhpd (%rcx,%r[[REG2]],8), %xmm0 +; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1 +; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1 define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind { %a = load <4 x i32>* %i @@ -53,3 +56,35 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind { %v3 = insertelement <4 x double> %v2, double %r3, i32 3 ret <4 x double> %v3 } + +; Check that the sequence previously used above, which bounces the vector off the +; cache works for x86-32. Note that in this case it will not be used for index +; calculation, since indexes are 32-bit, not 64. 
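; NOTE (editor's sketch, not part of this commit): the 32-bit `old` test below
; still bounces the vector through the stack and reloads each lane; the new
; 64-bit sequence checked above keeps it in registers, pulling two 32-bit
; indices out of each 64-bit GPR (register names here are placeholders):
;   movd   %xmm0, %rA      ; lanes 0 and 1
;   pextrq $1, %xmm0, %rB  ; lanes 2 and 3
;   movslq %eA, %rC        ; sign-extend the low lane
;   sarq   $32, %rA        ; arithmetic shift exposes the high lane, sign-extended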
+; CHECK-LABEL: old: +; LIN32: movaps %xmm0, (%esp) +; LIN32-DAG: {{(mov|and)}}l (%esp), +; LIN32-DAG: {{(mov|and)}}l 4(%esp), +; LIN32-DAG: {{(mov|and)}}l 8(%esp), +; LIN32-DAG: {{(mov|and)}}l 12(%esp), +define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind { + %a = load <4 x i32>* %i + %b = load <4 x i32>* %h + %j = and <4 x i32> %a, %b + %d0 = extractelement <4 x i32> %j, i32 0 + %d1 = extractelement <4 x i32> %j, i32 1 + %d2 = extractelement <4 x i32> %j, i32 2 + %d3 = extractelement <4 x i32> %j, i32 3 + %q0 = zext i32 %d0 to i64 + %q1 = zext i32 %d1 to i64 + %q2 = zext i32 %d2 to i64 + %q3 = zext i32 %d3 to i64 + %r0 = and i64 %q0, %f + %r1 = and i64 %q1, %f + %r2 = and i64 %q2, %f + %r3 = and i64 %q3, %f + %v0 = insertelement <4 x i64> undef, i64 %r0, i32 0 + %v1 = insertelement <4 x i64> %v0, i64 %r1, i32 1 + %v2 = insertelement <4 x i64> %v1, i64 %r2, i32 2 + %v3 = insertelement <4 x i64> %v2, i64 %r3, i32 3 + ret <4 x i64> %v3 +} diff --git a/test/CodeGen/X86/gcc_except_table_functions.ll b/test/CodeGen/X86/gcc_except_table_functions.ll new file mode 100644 index 000000000000..4a8168050e56 --- /dev/null +++ b/test/CodeGen/X86/gcc_except_table_functions.ll @@ -0,0 +1,53 @@ +; RUN: llc -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s + +; This test demonstrates that it is possible to use functions for typeinfo +; instead of global variables. While __gxx_personality_v0 would never know what +; to do with them, other EH schemes such as SEH might use them. + +declare i32 @__gxx_personality_v0(...) +declare void @filt0() +declare void @filt1() +declare void @_Z1fv() +declare i32 @llvm.eh.typeid.for(i8*) + +define i32 @main() uwtable { +entry: + invoke void @_Z1fv() + to label %try.cont unwind label %lpad + +try.cont: + ret i32 0 + +lpad: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* bitcast (void ()* @filt0 to i8*) + catch i8* bitcast (void ()* @filt1 to i8*) + %sel = extractvalue { i8*, i32 } %0, 1 + %id0 = call i32 @llvm.eh.typeid.for(i8* bitcast (void ()* @filt0 to i8*)) + %is_f0 = icmp eq i32 %sel, %id0 + br i1 %is_f0, label %try.cont, label %check_f1 + +check_f1: + %id1 = call i32 @llvm.eh.typeid.for(i8* bitcast (void ()* @filt1 to i8*)) + %is_f1 = icmp eq i32 %sel, %id1 + br i1 %is_f1, label %try.cont, label %eh.resume + +eh.resume: + resume { i8*, i32 } %0 +} + +; CHECK-LABEL: main: +; CHECK: .cfi_startproc +; CHECK: .cfi_personality 3, __gxx_personality_v0 +; CHECK: .cfi_lsda 3, .Lexception0 +; CHECK: .cfi_def_cfa_offset 16 +; CHECK: callq _Z1fv +; CHECK: retq +; CHECK: cmpl $2, %edx +; CHECK: je +; CHECK: cmpl $1, %edx +; CHECK: je +; CHECK: callq _Unwind_Resume +; CHECK: .cfi_endproc +; CHECK: GCC_except_table0: +; CHECK: Lexception0: diff --git a/test/CodeGen/X86/ghc-cc.ll b/test/CodeGen/X86/ghc-cc.ll index 4dba2c086329..3ada8c8ce98e 100644 --- a/test/CodeGen/X86/ghc-cc.ll +++ b/test/CodeGen/X86/ghc-cc.ll @@ -12,13 +12,13 @@ entry: ; CHECK: movl {{[0-9]*}}(%esp), %ebx ; CHECK-NEXT: movl {{[0-9]*}}(%esp), %ebp ; CHECK-NEXT: calll addtwo - %0 = call cc 10 i32 @addtwo(i32 %a, i32 %b) + %0 = call ghccc i32 @addtwo(i32 %a, i32 %b) ; CHECK: calll foo call void @foo() nounwind ret void } -define cc 10 i32 @addtwo(i32 %x, i32 %y) nounwind { +define ghccc i32 @addtwo(i32 %x, i32 %y) nounwind { entry: ; CHECK: leal (%ebx,%ebp), %eax %0 = add i32 %x, %y @@ -26,7 +26,7 @@ entry: ret i32 %0 } -define cc 10 void @foo() nounwind { +define ghccc void @foo() nounwind { entry: ; CHECK: movl r1, %esi ; 
CHECK-NEXT: movl hp, %edi @@ -37,8 +37,8 @@ entry: %2 = load i32* @sp %3 = load i32* @base ; CHECK: jmp bar - tail call cc 10 void @bar( i32 %3, i32 %2, i32 %1, i32 %0 ) nounwind + tail call ghccc void @bar( i32 %3, i32 %2, i32 %1, i32 %0 ) nounwind ret void } -declare cc 10 void @bar(i32, i32, i32, i32) +declare ghccc void @bar(i32, i32, i32, i32) diff --git a/test/CodeGen/X86/ghc-cc64.ll b/test/CodeGen/X86/ghc-cc64.ll index 403391e81658..7251dd673b30 100644 --- a/test/CodeGen/X86/ghc-cc64.ll +++ b/test/CodeGen/X86/ghc-cc64.ll @@ -25,13 +25,13 @@ entry: ; CHECK: movq %rdi, %r13 ; CHECK-NEXT: movq %rsi, %rbp ; CHECK-NEXT: callq addtwo - %0 = call cc 10 i64 @addtwo(i64 %a, i64 %b) + %0 = call ghccc i64 @addtwo(i64 %a, i64 %b) ; CHECK: callq foo call void @foo() nounwind ret void } -define cc 10 i64 @addtwo(i64 %x, i64 %y) nounwind { +define ghccc i64 @addtwo(i64 %x, i64 %y) nounwind { entry: ; CHECK: leaq (%r13,%rbp), %rax %0 = add i64 %x, %y @@ -39,7 +39,7 @@ entry: ret i64 %0 } -define cc 10 void @foo() nounwind { +define ghccc void @foo() nounwind { entry: ; CHECK: movsd d2(%rip), %xmm6 ; CHECK-NEXT: movsd d1(%rip), %xmm5 @@ -74,12 +74,12 @@ entry: %14 = load i64* @sp %15 = load i64* @base ; CHECK: jmp bar - tail call cc 10 void @bar( i64 %15, i64 %14, i64 %13, i64 %12, i64 %11, + tail call ghccc void @bar( i64 %15, i64 %14, i64 %13, i64 %12, i64 %11, i64 %10, i64 %9, i64 %8, i64 %7, i64 %6, float %5, float %4, float %3, float %2, double %1, double %0 ) nounwind ret void } -declare cc 10 void @bar(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, +declare ghccc void @bar(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, float, float, float, float, double, double) diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll index c763f3947e59..fa1169d8a8e3 100644 --- a/test/CodeGen/X86/global-sections.ll +++ b/test/CodeGen/X86/global-sections.ll @@ -53,21 +53,20 @@ define void @F1() { ; _Complex long long const G4 = 34; -@G4 = unnamed_addr constant {i64,i64} { i64 34, i64 0 } +@G4 = private unnamed_addr constant {i64,i64} { i64 34, i64 0 } ; DARWIN: .section __TEXT,__literal16,16byte_literals -; DARWIN: _G4: +; DARWIN: L_G4: ; DARWIN: .long 34 ; DARWIN-STATIC: .section __TEXT,__literal16,16byte_literals -; DARWIN-STATIC: _G4: +; DARWIN-STATIC: L_G4: ; DARWIN-STATIC: .long 34 ; DARWIN64: .section __TEXT,__literal16,16byte_literals -; DARWIN64: _G4: +; DARWIN64: L_G4: ; DARWIN64: .quad 34 - ; int G5 = 47; @G5 = global i32 47 @@ -194,3 +193,23 @@ define void @F1() { ; WIN32-SECTIONS: L_G14: ; WIN32-SECTIONS: .asciz "foo" +; cannot be merged on MachO, but can on other formats. +@G15 = unnamed_addr constant i64 0 + +; LINUX: .section .rodata.cst8,"aM",@progbits,8 +; LINUX: G15: + +; DARWIN: .section __TEXT,__const +; DARWIN: _G15: + +; DARWIN-STATIC: .section __TEXT,__const +; DARWIN-STATIC: _G15: + +; DARWIN64: .section __TEXT,__const +; DARWIN64: _G15: + +; LINUX-SECTIONS: .section .rodata.G15,"aM",@progbits,8 +; LINUX-SECTIONS: G15: + +; WIN32-SECTIONS: .section .rdata,"rd",one_only,_G15 +; WIN32-SECTIONS: _G15: diff --git a/test/CodeGen/X86/hoist-invariant-load.ll b/test/CodeGen/X86/hoist-invariant-load.ll index 34191e3f9a31..c9e52903c79e 100644 --- a/test/CodeGen/X86/hoist-invariant-load.ll +++ b/test/CodeGen/X86/hoist-invariant-load.ll @@ -27,4 +27,4 @@ for.end: ; preds = %for.body declare i8* @objc_msgSend(i8*, i8*, ...) 
nonlazybind -!0 = metadata !{} +!0 = !{} diff --git a/test/CodeGen/X86/i8-umulo.ll b/test/CodeGen/X86/i8-umulo.ll deleted file mode 100644 index ba846f3e9be3..000000000000 --- a/test/CodeGen/X86/i8-umulo.ll +++ /dev/null @@ -1,24 +0,0 @@ -; RUN: llc -mcpu=generic -march=x86 < %s | FileCheck %s -; PR19858 - -declare {i8, i1} @llvm.umul.with.overflow.i8(i8 %a, i8 %b) -define i8 @testumulo(i32 %argc) { -; CHECK: imulw -; CHECK: testb %{{.+}}, %{{.+}} -; CHECK: je [[NOOVERFLOWLABEL:.+]] -; CHECK: {{.*}}[[NOOVERFLOWLABEL]]: -; CHECK-NEXT: movb -; CHECK-NEXT: retl -top: - %RHS = trunc i32 %argc to i8 - %umul = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 25, i8 %RHS) - %ex = extractvalue { i8, i1 } %umul, 1 - br i1 %ex, label %overflow, label %nooverlow - -overflow: - ret i8 %RHS - -nooverlow: - %umul.value = extractvalue { i8, i1 } %umul, 0 - ret i8 %umul.value -} diff --git a/test/CodeGen/X86/ident-metadata.ll b/test/CodeGen/X86/ident-metadata.ll index a5686730cee9..e08738f47df7 100644 --- a/test/CodeGen/X86/ident-metadata.ll +++ b/test/CodeGen/X86/ident-metadata.ll @@ -5,5 +5,5 @@ ; CHECK: .ident "clang version x.x" ; CHECK-NEXT: .ident "something else" !llvm.ident = !{!0, !1} -!0 = metadata !{metadata !"clang version x.x"} -!1 = metadata !{metadata !"something else"} +!0 = !{!"clang version x.x"} +!1 = !{!"something else"} diff --git a/test/CodeGen/X86/inalloca-ctor.ll b/test/CodeGen/X86/inalloca-ctor.ll index 7cfa92913578..b1781d30f913 100644 --- a/test/CodeGen/X86/inalloca-ctor.ll +++ b/test/CodeGen/X86/inalloca-ctor.ll @@ -17,16 +17,16 @@ entry: ; CHECK: movl %esp, call void @Foo_ctor(%Foo* %c) ; CHECK: leal 12(%{{.*}}), -; CHECK: subl $4, %esp -; CHECK: calll _Foo_ctor +; CHECK-NEXT: pushl +; CHECK-NEXT: calll _Foo_ctor ; CHECK: addl $4, %esp %b = getelementptr %frame* %args, i32 0, i32 1 store i32 42, i32* %b ; CHECK: movl $42, %a = getelementptr %frame* %args, i32 0, i32 0 call void @Foo_ctor(%Foo* %a) -; CHECK: subl $4, %esp -; CHECK: calll _Foo_ctor +; CHECK-NEXT: pushl +; CHECK-NEXT: calll _Foo_ctor ; CHECK: addl $4, %esp call void @f(%frame* inalloca %args) ; CHECK: calll _f diff --git a/test/CodeGen/X86/inalloca-invoke.ll b/test/CodeGen/X86/inalloca-invoke.ll index 6cff9ac0640c..b56f24d99628 100644 --- a/test/CodeGen/X86/inalloca-invoke.ll +++ b/test/CodeGen/X86/inalloca-invoke.ll @@ -37,7 +37,7 @@ blah: invoke.cont: call void @begin(%Iter* sret %beg) -; CHECK: movl %[[beg]], +; CHECK: pushl %[[beg]] ; CHECK: calll _begin invoke void @reverse(%frame.reverse* inalloca align 4 %rev_args) diff --git a/test/CodeGen/X86/inalloca-regparm.ll b/test/CodeGen/X86/inalloca-regparm.ll new file mode 100644 index 000000000000..9dd916bfbb37 --- /dev/null +++ b/test/CodeGen/X86/inalloca-regparm.ll @@ -0,0 +1,15 @@ +; RUN: llc -mtriple=i686-windows-msvc < %s -o /dev/null +; RUN: not llc -mtriple=x86_64-windows-msvc %s -o /dev/null 2>&1 | FileCheck %s + +; This will compile successfully on x86 but not x86_64, because %b will become a +; register parameter. 
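; NOTE (editor's note, not part of this commit): inalloca requires its argument
; to be passed in memory at the top of the stack. Under 32-bit x86_thiscallcc,
; %a travels in %ecx and %b stays on the stack, so that is satisfiable; under
; the Win64 convention the second integer argument would be assigned a
; register, which is what the diagnostic checked below reports.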
+ +declare x86_thiscallcc i32 @f(i32 %a, i32* inalloca %b) +define void @g() { + %b = alloca inalloca i32 + store i32 2, i32* %b + call x86_thiscallcc i32 @f(i32 0, i32* inalloca %b) + ret void +} + +; CHECK: cannot use inalloca attribute on a register parameter diff --git a/test/CodeGen/X86/inalloca-stdcall.ll b/test/CodeGen/X86/inalloca-stdcall.ll index 54f97d99a9c7..e5b07e262c7b 100644 --- a/test/CodeGen/X86/inalloca-stdcall.ll +++ b/test/CodeGen/X86/inalloca-stdcall.ll @@ -19,7 +19,7 @@ define void @g() { call x86_stdcallcc void @f(%Foo* inalloca %b) ; CHECK: calll _f@8 ; CHECK-NOT: %esp -; CHECK: subl $4, %esp +; CHECK: pushl ; CHECK: calll _i@4 call x86_stdcallcc void @i(i32 0) ret void diff --git a/test/CodeGen/X86/inline-asm-flag-clobber.ll b/test/CodeGen/X86/inline-asm-flag-clobber.ll index bb7c33e422ed..0874b51af6a5 100644 --- a/test/CodeGen/X86/inline-asm-flag-clobber.ll +++ b/test/CodeGen/X86/inline-asm-flag-clobber.ll @@ -29,4 +29,4 @@ entry: ret i32 %1 } -!0 = metadata !{i64 935930} +!0 = !{i64 935930} diff --git a/test/CodeGen/X86/inline-asm-fpstack.ll b/test/CodeGen/X86/inline-asm-fpstack.ll index 91c477baaa51..bb3778a28116 100644 --- a/test/CodeGen/X86/inline-asm-fpstack.ll +++ b/test/CodeGen/X86/inline-asm-fpstack.ll @@ -340,3 +340,65 @@ entry: %0 = tail call i32 asm "fcomi $2, $1; pushf; pop $0", "=r,{st},{st(1)},~{dirflag},~{fpsr},~{flags}"(double 2.000000e+00, double 2.000000e+00) nounwind ret i32 %0 } + +; <rdar://problem/16952634> +; X87 stackifier asserted when there was an ST register defined by an +; inline-asm instruction and the ST register was live across another +; inline-asm instruction. +; +; INLINEASM <es:frndint> [sideeffect] [attdialect], $0:[regdef], %ST0<imp-def,tied5>, $1:[reguse tiedto:$0], %ST0<tied3>, $2:[clobber], %EFLAGS<earlyclobber,imp-def,dead> +; INLINEASM <es:fldcw $0> [sideeffect] [mayload] [attdialect], $0:[mem], %EAX<undef>, 1, %noreg, 0, %noreg, $1:[clobber], %EFLAGS<earlyclobber,imp-def,dead> +; %FP0<def> = COPY %ST0 + +; CHECK-LABEL: _test_live_st +; CHECK: ## InlineAsm Start +; CHECK: frndint +; CHECK: ## InlineAsm End +; CHECK: ## InlineAsm Start +; CHECK: fldcw +; CHECK: ## InlineAsm End + +%struct.fpu_t = type { [8 x x86_fp80], x86_fp80, %struct.anon1, %struct.anon2, i32, i8, [15 x i8] } +%struct.anon1 = type { i32, i32, i32 } +%struct.anon2 = type { i32, i32, i32, i32 } + +@fpu = external global %struct.fpu_t, align 16 + +; Function Attrs: ssp +define void @test_live_st(i32 %a1) { +entry: + %0 = load x86_fp80* undef, align 16 + %cond = icmp eq i32 %a1, 1 + br i1 %cond, label %sw.bb4.i, label %_Z5tointRKe.exit + +sw.bb4.i: + %1 = call x86_fp80 asm sideeffect "frndint", "={st},0,~{dirflag},~{fpsr},~{flags}"(x86_fp80 %0) + call void asm sideeffect "fldcw $0", "*m,~{dirflag},~{fpsr},~{flags}"(i32* undef) + br label %_Z5tointRKe.exit + +_Z5tointRKe.exit: + %result.0.i = phi x86_fp80 [ %1, %sw.bb4.i ], [ %0, %entry ] + %conv.i1814 = fptosi x86_fp80 %result.0.i to i32 + %conv626 = sitofp i32 %conv.i1814 to x86_fp80 + store x86_fp80 %conv626, x86_fp80* getelementptr inbounds (%struct.fpu_t* @fpu, i32 0, i32 1) + br label %return + +return: + ret void +} + +; Check that x87 stackifier is correctly rewriting FP registers to ST registers. 
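; NOTE (editor's note, not part of this commit): in the asm below, "={st}" binds
; the first result to st(0), the top of the x87 stack, and "={st(1)}" binds the
; second to the slot beneath it, so the stackifier has to rewrite both virtual
; FP-register results to physical ST registers.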
+; +; CHECK-LABEL: _test_operand_rewrite +; CHECK: ## InlineAsm Start +; CHECK: foo %st(0), %st(1) +; CHECK: ## InlineAsm End + +define double @test_operand_rewrite() { +entry: + %0 = tail call { double, double } asm sideeffect "foo $0, $1", "={st},={st(1)},~{dirflag},~{fpsr},~{flags}"() + %asmresult = extractvalue { double, double } %0, 0 + %asmresult1 = extractvalue { double, double } %0, 1 + %sub = fsub double %asmresult, %asmresult1 + ret double %sub +} diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll index d4174539f2f9..dfa8aed46463 100644 --- a/test/CodeGen/X86/jump_sign.ll +++ b/test/CodeGen/X86/jump_sign.ll @@ -284,7 +284,7 @@ entry: define i32 @func_test1(i32 %p1) nounwind uwtable { entry: ; CHECK-LABEL: func_test1: -; CHECK: testb +; CHECK: andb ; CHECK: j ; CHECK: ret %0 = load i32* @b, align 4 diff --git a/test/CodeGen/X86/jump_table_alias.ll b/test/CodeGen/X86/jump_table_alias.ll index f3691fda221e..20622009e376 100644 --- a/test/CodeGen/X86/jump_table_alias.ll +++ b/test/CodeGen/X86/jump_table_alias.ll @@ -5,7 +5,7 @@ entry: ret i32 0 } -@i = alias internal i32 ()* @f +@i = internal alias i32 ()* @f @j = alias i32 ()* @f define i32 @main(i32 %argc, i8** %argv) { @@ -25,7 +25,6 @@ define i32 @main(i32 %argc, i8** %argv) { ; There should only be one table, even though there are two GlobalAliases, ; because they both alias the same value. -; CHECK: .globl __llvm_jump_instr_table_0_1 ; CHECK: .align 8, 0x90 ; CHECK: .type __llvm_jump_instr_table_0_1,@function ; CHECK: __llvm_jump_instr_table_0_1: diff --git a/test/CodeGen/X86/jump_table_align.ll b/test/CodeGen/X86/jump_table_align.ll new file mode 100644 index 000000000000..6ad48d1f54f7 --- /dev/null +++ b/test/CodeGen/X86/jump_table_align.ll @@ -0,0 +1,29 @@ +; RUN: llc -filetype=obj <%s -jump-table-type=single -o %t1 +; RUN: llvm-objdump -triple=x86_64-unknown-linux-gnu -d %t1 | FileCheck %s +target triple = "x86_64-unknown-linux-gnu" +define i32 @f() unnamed_addr jumptable { + ret i32 0 +} + +define i32 @g(i8* %a) unnamed_addr jumptable { + ret i32 0 +} + +define void @h(void ()* %func) unnamed_addr jumptable { + ret void +} + +define i32 @main() { + %g = alloca i32 (...)*, align 8 + store i32 (...)* bitcast (i32 ()* @f to i32 (...)*), i32 (...)** %g, align 8 + %1 = load i32 (...)** %g, align 8 + %call = call i32 (...)* %1() + call void (void ()*)* @h(void ()* bitcast (void (void ()*)* @h to void ()*)) + %a = call i32 (i32*)* bitcast (i32 (i8*)* @g to i32(i32*)*)(i32* null) + ret i32 %a +} + +; Make sure that the padding from getJumpInstrTableEntryBound is right. 
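; NOTE (editor's note, not part of this commit): each table entry is aligned to
; 8 bytes and the jmp encoding (e9 + rel32) is 5 bytes, so the 3-byte nopl
; (0f 1f 00) pads the slot: 5 + 3 = 8, as the bytes checked below show.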
+; CHECK: __llvm_jump_instr_table_0_1: +; CHECK-NEXT: e9 00 00 00 00 jmp 0 +; CHECK-NEXT: 0f 1f 00 nopl (%rax) diff --git a/test/CodeGen/X86/jump_table_bitcast.ll b/test/CodeGen/X86/jump_table_bitcast.ll index 33a798f7a6b7..749b77a166ea 100644 --- a/test/CodeGen/X86/jump_table_bitcast.ll +++ b/test/CodeGen/X86/jump_table_bitcast.ll @@ -15,12 +15,12 @@ define void @h(void ()* %func) unnamed_addr jumptable { define i32 @main() { %g = alloca i32 (...)*, align 8 store i32 (...)* bitcast (i32 ()* @f to i32 (...)*), i32 (...)** %g, align 8 -; CHECK: movq $__llvm_jump_instr_table_0_[[ENTRY:1|2|3]], (%rsp) -; CHECK: movl $__llvm_jump_instr_table_0_[[ENTRY]], %ecx +; CHECK: movq $__llvm_jump_instr_table_0_[[ENTRY:1|2|3]], +; CHECK: movl $__llvm_jump_instr_table_0_[[ENTRY]], %1 = load i32 (...)** %g, align 8 %call = call i32 (...)* %1() call void (void ()*)* @h(void ()* bitcast (void (void ()*)* @h to void ()*)) -; CHECK: movl $__llvm_jump_instr_table_0_{{1|2|3}}, %edi +; CHECK: movl $__llvm_jump_instr_table_0_{{1|2|3}}, ; CHECK: callq h %a = call i32 (i32*)* bitcast (i32 (i8*)* @g to i32(i32*)*)(i32* null) @@ -28,17 +28,14 @@ define i32 @main() { ret i32 %a } -; CHECK: .globl __llvm_jump_instr_table_0_1 ; CHECK: .align 8, 0x90 ; CHECK: .type __llvm_jump_instr_table_0_1,@function ; CHECK: __llvm_jump_instr_table_0_1: ; CHECK: jmp {{f|g|h}}@PLT -; CHECK: .globl __llvm_jump_instr_table_0_2 ; CHECK: .align 8, 0x90 ; CHECK: .type __llvm_jump_instr_table_0_2,@function ; CHECK: __llvm_jump_instr_table_0_2: ; CHECK: jmp {{f|g|h}}@PLT -; CHECK: .globl __llvm_jump_instr_table_0_3 ; CHECK: .align 8, 0x90 ; CHECK: .type __llvm_jump_instr_table_0_3,@function ; CHECK: __llvm_jump_instr_table_0_3: diff --git a/test/CodeGen/X86/jump_tables.ll b/test/CodeGen/X86/jump_tables.ll index 5a0aed0c1761..485154eaa2a9 100644 --- a/test/CodeGen/X86/jump_tables.ll +++ b/test/CodeGen/X86/jump_tables.ll @@ -7,6 +7,20 @@ target triple = "x86_64-unknown-linux-gnu" %struct.fun_struct = type { i32 (...)* } +@a = global [12 x i32 () *] [ i32 ()* bitcast (void ()* @indirect_fun to i32 ()*), + i32 ()* bitcast (void ()* @indirect_fun_match to i32 ()*), + i32 ()* bitcast (i32 ()* @indirect_fun_i32 to i32 ()*), + i32 ()* bitcast (i32 (i32)* @indirect_fun_i32_1 to i32 ()*), + i32 ()* bitcast (i32 (i32, i32)* @indirect_fun_i32_2 to i32 ()*), + i32 ()* bitcast (i32* (i32*, i32)* @indirect_fun_i32S_2 to i32 ()*), + i32 ()* bitcast (void (%struct.fun_struct)* @indirect_fun_struct to i32 ()*), + i32 ()* bitcast (void (i32 (...)*, i32)* @indirect_fun_fun to i32 ()*), + i32 ()* bitcast (i32 (i32 (...)*, i32)* @indirect_fun_fun_ret to i32 ()*), + i32 ()* bitcast (void ([19 x i8])* @indirect_fun_array to i32 ()*), + i32 ()* bitcast (void (<3 x i32>)* @indirect_fun_vec to i32 ()*), + i32 ()* bitcast (void (<4 x float>)* @indirect_fun_vec_2 to i32 ()*) + ] + define void @indirect_fun() unnamed_addr jumptable { ret void } @@ -74,62 +88,50 @@ define i32 @main(i32 %argc, i8** %argv) { ret i32 %a } -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_1 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_1,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_1: ; SINGLE-DAG: jmp indirect_fun_array@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_2 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_2,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_2: ; SINGLE-DAG: jmp indirect_fun_i32_2@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_3 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type 
__llvm_jump_instr_table_0_3,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_3: ; SINGLE-DAG: jmp indirect_fun_vec_2@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_4 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_4,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_4: ; SINGLE-DAG: jmp indirect_fun_i32S_2@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_5 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_5,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_5: ; SINGLE-DAG: jmp indirect_fun_struct@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_6 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_6,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_6: ; SINGLE-DAG: jmp indirect_fun_i32_1@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_7 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_7,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_7: ; SINGLE-DAG: jmp indirect_fun_i32@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_8 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_8,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_8: ; SINGLE-DAG: jmp indirect_fun_fun@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_9 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_9,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_9: ; SINGLE-DAG: jmp indirect_fun_fun_ret@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_10 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_10,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_10: ; SINGLE-DAG: jmp indirect_fun@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_11 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_11,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_11: ; SINGLE-DAG: jmp indirect_fun_match@PLT -; SINGLE-DAG: .globl __llvm_jump_instr_table_0_12 ; SINGLE-DAG: .align 8, 0x90 ; SINGLE-DAG: .type __llvm_jump_instr_table_0_12,@function ; SINGLE-DAG: __llvm_jump_instr_table_0_12: @@ -144,82 +146,69 @@ define i32 @main(i32 %argc, i8** %argv) { ; SINGLE-DAG: ud2 -; ARITY-DAG: .globl __llvm_jump_instr_table_2_1 ; ARITY-DAG: .align 8, 0x90 ; ARITY-DAG: .type __llvm_jump_instr_table_2_1,@function ; ARITY-DAG: __llvm_jump_instr_table_2_1: ; ARITY-DAG: jmp indirect_fun{{.*}}@PLT ; ARITY-DAG: .align 8, 0x90 ; ARITY-DAG: ud2 -; ARITY-DAG: .globl __llvm_jump_instr_table_0_1 ; ARITY-DAG: .align 8, 0x90 ; ARITY-DAG: .type __llvm_jump_instr_table_0_1,@function ; ARITY-DAG: __llvm_jump_instr_table_0_1: ; ARITY-DAG: jmp indirect_fun{{.*}}@PLT -; ARITY-DAG: .globl __llvm_jump_instr_table_1_1 ; ARITY-DAG: .align 8, 0x90 ; ARITY-DAG: .type __llvm_jump_instr_table_1_1,@function ; ARITY-DAG: __llvm_jump_instr_table_1_1: ; ARITY-DAG: jmp indirect_fun{{.*}}@PLT -; SIMPL-DAG: .globl __llvm_jump_instr_table_2_1 ; SIMPL-DAG: .align 8, 0x90 ; SIMPL-DAG: .type __llvm_jump_instr_table_2_1,@function ; SIMPL-DAG: __llvm_jump_instr_table_2_1: ; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT ; SIMPL-DAG: .align 8, 0x90 ; SIMPL-DAG: ud2 -; SIMPL-DAG: .globl __llvm_jump_instr_table_0_1 ; SIMPL-DAG: .align 8, 0x90 ; SIMPL-DAG: .type __llvm_jump_instr_table_0_1,@function ; SIMPL-DAG: __llvm_jump_instr_table_0_1: ; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT -; SIMPL-DAG: .globl __llvm_jump_instr_table_1_1 ; SIMPL-DAG: .align 8, 0x90 ; SIMPL-DAG: .type __llvm_jump_instr_table_1_1,@function ; SIMPL-DAG: __llvm_jump_instr_table_1_1: ; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT -; SIMPL-DAG: 
.globl __llvm_jump_instr_table_3_1 ; SIMPL-DAG: .align 8, 0x90 ; SIMPL-DAG: .type __llvm_jump_instr_table_3_1,@function ; SIMPL-DAG: __llvm_jump_instr_table_3_1: ; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT -; SIMPL-DAG: .globl __llvm_jump_instr_table_4_1 ; SIMPL-DAG: .align 8, 0x90 ; SIMPL-DAG: .type __llvm_jump_instr_table_4_1,@function ; SIMPL-DAG: __llvm_jump_instr_table_4_1: ; SIMPL-DAG: jmp indirect_fun{{.*}}@PLT -; FULL-DAG: .globl __llvm_jump_instr_table_10_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_10_1,@function ; FULL-DAG:__llvm_jump_instr_table_10_1: ; FULL-DAG: jmp indirect_fun_i32_1@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_9_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_9_1,@function ; FULL-DAG:__llvm_jump_instr_table_9_1: ; FULL-DAG: jmp indirect_fun_i32_2@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_7_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_7_1,@function ; FULL-DAG:__llvm_jump_instr_table_7_1: ; FULL-DAG: jmp indirect_fun_i32S_2@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_3_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_3_1,@function ; FULL-DAG:__llvm_jump_instr_table_3_1: ; FULL-DAG: jmp indirect_fun_vec_2@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_2_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_2_1,@function ; FULL-DAG:__llvm_jump_instr_table_2_1: @@ -228,42 +217,36 @@ define i32 @main(i32 %argc, i8** %argv) { ; FULL-DAG: ud2 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_8_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_8_1,@function ; FULL-DAG:__llvm_jump_instr_table_8_1: ; FULL-DAG: jmp indirect_fun_i32@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_1_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_1_1,@function ; FULL-DAG:__llvm_jump_instr_table_1_1: ; FULL-DAG: jmp indirect_fun_array@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_0_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_0_1,@function ; FULL-DAG:__llvm_jump_instr_table_0_1: ; FULL-DAG: jmp indirect_fun_vec@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_6_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_6_1,@function ; FULL-DAG:__llvm_jump_instr_table_6_1: ; FULL-DAG: jmp indirect_fun_struct@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_5_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_5_1,@function ; FULL-DAG:__llvm_jump_instr_table_5_1: ; FULL-DAG: jmp indirect_fun_fun@PLT ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: ud2 -; FULL-DAG: .globl __llvm_jump_instr_table_4_1 ; FULL-DAG: .align 8, 0x90 ; FULL-DAG: .type __llvm_jump_instr_table_4_1,@function ; FULL-DAG:__llvm_jump_instr_table_4_1: diff --git a/test/CodeGen/X86/large-code-model-isel.ll b/test/CodeGen/X86/large-code-model-isel.ll new file mode 100644 index 000000000000..3c283d934949 --- /dev/null +++ b/test/CodeGen/X86/large-code-model-isel.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -code-model=large -mcpu=core2 -march=x86-64 -O0 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +@.str10 = 
external unnamed_addr constant [2 x i8], align 1 + +define void @foo() { +; CHECK-LABEL: foo: +entry: +; CHECK: callq + %call = call i64* undef(i64* undef, i8* getelementptr inbounds ([2 x i8]* @.str10, i32 0, i32 0)) + ret void +} diff --git a/test/CodeGen/X86/lea-2.ll b/test/CodeGen/X86/lea-2.ll index 82cefb728c6e..98c57c7d090d 100644 --- a/test/CodeGen/X86/lea-2.ll +++ b/test/CodeGen/X86/lea-2.ll @@ -1,4 +1,7 @@ -; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | FileCheck %s +; RUN: llc < %s -mtriple=i686-linux -x86-asm-syntax=intel | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux -x86-asm-syntax=intel | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -x86-asm-syntax=intel | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-nacl -x86-asm-syntax=intel | FileCheck %s define i32 @test1(i32 %A, i32 %B) { %tmp1 = shl i32 %A, 2 @@ -7,7 +10,7 @@ define i32 @test1(i32 %A, i32 %B) { ; The above computation of %tmp4 should match a single lea, without using ; actual add instructions. ; CHECK-NOT: add -; CHECK: lea {{[a-z]+}}, dword ptr [{{[a-z]+}} + 4*{{[a-z]+}} - 5] +; CHECK: lea {{[a-z]+}}, [{{[a-z]+}} + 4*{{[a-z]+}} - 5] ret i32 %tmp4 } diff --git a/test/CodeGen/X86/lea-3.ll b/test/CodeGen/X86/lea-3.ll index c439ee1d06e3..a56403a24b03 100644 --- a/test/CodeGen/X86/lea-3.ll +++ b/test/CodeGen/X86/lea-3.ll @@ -1,4 +1,6 @@ ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s ; CHECK: leaq (,[[A0:%rdi|%rcx]],4), %rax diff --git a/test/CodeGen/X86/lea-4.ll b/test/CodeGen/X86/lea-4.ll index cef47264a583..00c2278c54bf 100644 --- a/test/CodeGen/X86/lea-4.ll +++ b/test/CodeGen/X86/lea-4.ll @@ -1,4 +1,7 @@ -; RUN: llc < %s -march=x86-64 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s + define zeroext i16 @t1(i32 %on_off) nounwind { entry: diff --git a/test/CodeGen/X86/lea-5.ll b/test/CodeGen/X86/lea-5.ll new file mode 100644 index 000000000000..50d3aaf4c594 --- /dev/null +++ b/test/CodeGen/X86/lea-5.ll @@ -0,0 +1,59 @@ +; Test for more complicated forms of lea operands, which can be generated +; in loop-optimized cases. +; See also http://llvm.org/bugs/show_bug.cgi?id=20016 + +; RUN: llc < %s -mtriple=x86_64-linux -O2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -O2 | FileCheck %s -check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-nacl -O2 | FileCheck %s -check-prefix=X32 + +; Function Attrs: nounwind readnone uwtable +define void @foo(i32 %x, i32 %d) #0 { +entry: + %a = alloca [8 x i32], align 16 + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %d.addr.0 = phi i32 [ %d, %entry ], [ %inc, %while.cond ] + %arrayidx = getelementptr inbounds [8 x i32]* %a, i32 0, i32 %d.addr.0 + +; CHECK: leaq -40(%rsp,%r{{[^,]*}},4), %rax +; X32: leal -40(%rsp,%r{{[^,]*}},4), %eax + %0 = load i32* %arrayidx, align 4 + %cmp1 = icmp eq i32 %0, 0 + %inc = add nsw i32 %d.addr.0, 1 + +; CHECK: leaq 4(%r{{[^,]*}}), %r{{[^,]*}} +; X32: leal 4(%r{{[^,]*}}), %e{{[^,]*}} + br i1 %cmp1, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret void +} + +; The same test as above, but with enforced stack realignment (%a aligned by 64) +; to check one more case of correct lea generation.
+ +; Function Attrs: nounwind readnone uwtable +define void @bar(i32 %x, i32 %d) #0 { +entry: + %a = alloca [8 x i32], align 64 + br label %while.cond + +while.cond: ; preds = %while.cond, %entry + %d.addr.0 = phi i32 [ %d, %entry ], [ %inc, %while.cond ] + %arrayidx = getelementptr inbounds [8 x i32]* %a, i32 0, i32 %d.addr.0 + +; CHECK: leaq (%rsp,%r{{[^,]*}},4), %rax +; X32: leal (%rsp,%r{{[^,]*}},4), %eax + %0 = load i32* %arrayidx, align 4 + %cmp1 = icmp eq i32 %0, 0 + %inc = add nsw i32 %d.addr.0, 1 + +; CHECK: leaq 4(%r{{[^,]*}}), %r{{[^,]*}} +; X32: leal 4(%r{{[^,]*}}), %e{{[^,]*}} + br i1 %cmp1, label %while.end, label %while.cond + +while.end: ; preds = %while.cond + ret void +} + diff --git a/test/CodeGen/X86/lea.ll b/test/CodeGen/X86/lea.ll index 93cfe4611b44..9b6632c94693 100644 --- a/test/CodeGen/X86/lea.ll +++ b/test/CodeGen/X86/lea.ll @@ -1,5 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s define i32 @test1(i32 %x) nounwind { %tmp1 = shl i32 %x, 3 diff --git a/test/CodeGen/X86/long-extend.ll b/test/CodeGen/X86/long-extend.ll deleted file mode 100644 index 5bbd41dad9d2..000000000000 --- a/test/CodeGen/X86/long-extend.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc < %s -mcpu=core-avx-i -mtriple=x86_64-linux -asm-verbose=0| FileCheck %s -define void @test_long_extend(<16 x i8> %a, <16 x i32>* %p) nounwind { -; CHECK-LABEL: test_long_extend -; CHECK: vpunpcklbw %xmm1, %xmm0, [[REG1:%xmm[0-9]+]] -; CHECK: vpunpckhwd %xmm1, [[REG1]], [[REG2:%xmm[0-9]+]] -; CHECK: vpunpcklwd %xmm1, [[REG1]], %x[[REG3:mm[0-9]+]] -; CHECK: vinsertf128 $1, [[REG2]], %y[[REG3]], [[REG_result0:%ymm[0-9]+]] -; CHECK: vpunpckhbw %xmm1, %xmm0, [[REG4:%xmm[0-9]+]] -; CHECK: vpunpckhwd %xmm1, [[REG4]], [[REG5:%xmm[0-9]+]] -; CHECK: vpunpcklwd %xmm1, [[REG4]], %x[[REG6:mm[0-9]+]] -; CHECK: vinsertf128 $1, [[REG5]], %y[[REG6]], [[REG_result1:%ymm[0-9]+]] -; CHECK: vmovaps [[REG_result1]], 32(%rdi) -; CHECK: vmovaps [[REG_result0]], (%rdi) - - %tmp = zext <16 x i8> %a to <16 x i32> - store <16 x i32> %tmp, <16 x i32>*%p - ret void -} diff --git a/test/CodeGen/X86/loop-strength-reduce8.ll b/test/CodeGen/X86/loop-strength-reduce8.ll index 1d042769b0ba..c36047c451ae 100644 --- a/test/CodeGen/X86/loop-strength-reduce8.ll +++ b/test/CodeGen/X86/loop-strength-reduce8.ll @@ -1,6 +1,9 @@ ; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s -; CHECK: leal 16(%eax), %edx +; FIXME: The first two instructions, movl and addl, should have been combined to +; "leal 16(%eax), %edx" by the backend (PR20776). +; CHECK: movl %eax, %edx +; CHECK: addl $16, %edx ; CHECK: align ; CHECK: addl $4, %edx ; CHECK: decl %ecx diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll index f47161e5520c..edb8433ec30c 100644 --- a/test/CodeGen/X86/lower-bitcast.ll +++ b/test/CodeGen/X86/lower-bitcast.ll @@ -68,13 +68,13 @@ define i64 @test4(i64 %A) { %2 = bitcast <2 x i32> %add to i64 ret i64 %2 } -; FIXME: At the moment we still produce the sequence pshufd+paddq+pshufd. +; FIXME: At the moment we still produce the sequence pshufd+paddd+pshufd. ; Ideally, we should fold that sequence into a single paddd. This is fixed with ; the widening legalization. 
; ; CHECK-LABEL: test4 ; CHECK: pshufd -; CHECK-NEXT: paddq +; CHECK-NEXT: paddd ; CHECK-NEXT: pshufd ; CHECK: ret ; diff --git a/test/CodeGen/X86/lower-vec-shift-2.ll b/test/CodeGen/X86/lower-vec-shift-2.ll new file mode 100644 index 000000000000..770775d32427 --- /dev/null +++ b/test/CodeGen/X86/lower-vec-shift-2.ll @@ -0,0 +1,149 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s -check-prefix=SSE2 +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s -check-prefix=AVX + +define <8 x i16> @test1(<8 x i16> %A, <8 x i16> %B) { +; SSE2-LABEL: test1: +; SSE2: # BB#0 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: psllw %xmm1, %xmm0 +; SSE2-NEXT: retq +; AVX-LABEL: test1: +; AVX: # BB#0 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %vecinit14 = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer + %shl = shl <8 x i16> %A, %vecinit14 + ret <8 x i16> %shl +} + +define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) { +; SSE2-LABEL: test2: +; SSE2: # BB#0 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss %xmm1, %xmm2 +; SSE2-NEXT: pslld %xmm2, %xmm0 +; SSE2-NEXT: retq +; AVX-LABEL: test2: +; AVX: # BB#0 +; AVX-NEXT: vpxor %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %vecinit6 = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer + %shl = shl <4 x i32> %A, %vecinit6 + ret <4 x i32> %shl +} + +define <2 x i64> @test3(<2 x i64> %A, <2 x i64> %B) { +; SSE2-LABEL: test3: +; SSE2: # BB#0 +; SSE2-NEXT: psllq %xmm1, %xmm0 +; SSE2-NEXT: retq +; AVX-LABEL: test3: +; AVX: # BB#0 +; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %vecinit2 = shufflevector <2 x i64> %B, <2 x i64> undef, <2 x i32> zeroinitializer + %shl = shl <2 x i64> %A, %vecinit2 + ret <2 x i64> %shl +} + +define <8 x i16> @test4(<8 x i16> %A, <8 x i16> %B) { +; SSE2-LABEL: test4: +; SSE2: # BB#0 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: psrlw %xmm1, %xmm0 +; SSE2-NEXT: retq +; AVX-LABEL: test4: +; AVX: # BB#0 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %vecinit14 = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer + %shr = lshr <8 x i16> %A, %vecinit14 + ret <8 x i16> %shr +} + +define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) { +; SSE2-LABEL: test5: +; SSE2: # BB#0 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss %xmm1, %xmm2 +; SSE2-NEXT: psrld %xmm2, %xmm0 +; SSE2-NEXT: retq +; AVX-LABEL: test5: +; AVX: # BB#0 +; AVX-NEXT: vpxor %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %vecinit6 = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer + %shr = lshr <4 x i32> %A, %vecinit6 + ret <4 x i32> %shr +} + +define <2 x i64> @test6(<2 x i64> %A, <2 x i64> %B) { +; SSE2-LABEL: test6: +; SSE2: # BB#0 +; SSE2-NEXT: psrlq %xmm1, %xmm0 +; SSE2-NEXT: retq +; AVX-LABEL: test6: +; AVX: # BB#0 +; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %vecinit2 = shufflevector <2 x i64> %B, <2 x i64> undef, <2 x i32> 
zeroinitializer + %shr = lshr <2 x i64> %A, %vecinit2 + ret <2 x i64> %shr +} + +define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) { +; SSE2-LABEL: test7: +; SSE2: # BB#0 +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: psraw %xmm1, %xmm0 +; SSE2-NEXT: retq +; AVX-LABEL: test7: +; AVX: # BB#0 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %vecinit14 = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer + %shr = ashr <8 x i16> %A, %vecinit14 + ret <8 x i16> %shr +} + +define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) { +; SSE2-LABEL: test8: +; SSE2: # BB#0 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss %xmm1, %xmm2 +; SSE2-NEXT: psrad %xmm2, %xmm0 +; SSE2-NEXT: retq +; AVX-LABEL: test8: +; AVX: # BB#0 +; AVX-NEXT: vpxor %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %vecinit6 = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer + %shr = ashr <4 x i32> %A, %vecinit6 + ret <4 x i32> %shr +} diff --git a/test/CodeGen/X86/lzcnt-tzcnt.ll b/test/CodeGen/X86/lzcnt-tzcnt.ll index 07e4b9d8ce61..e98764a0d787 100644 --- a/test/CodeGen/X86/lzcnt-tzcnt.ll +++ b/test/CodeGen/X86/lzcnt-tzcnt.ll @@ -437,6 +437,137 @@ define i64 @test18_cttz(i64* %ptr) { ; CHECK: tzcnt ; CHECK-NEXT: ret +define i16 @test1b_ctlz(i16 %v) { + %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true) + %tobool = icmp ne i16 %v, 0 + %cond = select i1 %tobool, i16 16, i16 %cnt + ret i16 %cond +} +; CHECK-LABEL: test1b_ctlz +; CHECK: lzcnt +; CHECK-NEXT: ret + + +define i32 @test2b_ctlz(i32 %v) { + %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true) + %tobool = icmp ne i32 %v, 0 + %cond = select i1 %tobool, i32 32, i32 %cnt + ret i32 %cond +} +; CHECK-LABEL: test2b_ctlz +; CHECK: lzcnt +; CHECK-NEXT: ret + + +define i64 @test3b_ctlz(i64 %v) { + %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true) + %tobool = icmp ne i64 %v, 0 + %cond = select i1 %tobool, i64 64, i64 %cnt + ret i64 %cond +} +; CHECK-LABEL: test3b_ctlz +; CHECK: lzcnt +; CHECK-NEXT: ret + + +define i16 @test4b_ctlz(i16 %v) { + %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true) + %tobool = icmp ne i16 %v, 0 + %cond = select i1 %tobool, i16 %cnt, i16 16 + ret i16 %cond +} +; CHECK-LABEL: test4b_ctlz +; CHECK: lzcnt +; CHECK-NEXT: ret + + +define i32 @test5b_ctlz(i32 %v) { + %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true) + %tobool = icmp ne i32 %v, 0 + %cond = select i1 %tobool, i32 %cnt, i32 32 + ret i32 %cond +} +; CHECK-LABEL: test5b_ctlz +; CHECK: lzcnt +; CHECK-NEXT: ret + + +define i64 @test6b_ctlz(i64 %v) { + %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true) + %tobool = icmp ne i64 %v, 0 + %cond = select i1 %tobool, i64 %cnt, i64 64 + ret i64 %cond +} +; CHECK-LABEL: test6b_ctlz +; CHECK: lzcnt +; CHECK-NEXT: ret + + +define i16 @test1b_cttz(i16 %v) { + %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true) + %tobool = icmp ne i16 %v, 0 + %cond = select i1 %tobool, i16 16, i16 %cnt + ret i16 %cond +} +; CHECK-LABEL: test1b_cttz +; CHECK: tzcnt +; CHECK-NEXT: ret + + +define i32 @test2b_cttz(i32 %v) { + %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true) + %tobool = icmp ne i32 %v, 0 + %cond = select i1 %tobool, i32 32, i32 %cnt + ret i32 %cond +} +; CHECK-LABEL: test2b_cttz +; CHECK: tzcnt +; CHECK-NEXT: ret + + +define i64 
@test3b_cttz(i64 %v) { + %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true) + %tobool = icmp ne i64 %v, 0 + %cond = select i1 %tobool, i64 64, i64 %cnt + ret i64 %cond +} +; CHECK-LABEL: test3b_cttz +; CHECK: tzcnt +; CHECK-NEXT: ret + + +define i16 @test4b_cttz(i16 %v) { + %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true) + %tobool = icmp ne i16 %v, 0 + %cond = select i1 %tobool, i16 %cnt, i16 16 + ret i16 %cond +} +; CHECK-LABEL: test4b_cttz +; CHECK: tzcnt +; CHECK-NEXT: ret + + +define i32 @test5b_cttz(i32 %v) { + %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true) + %tobool = icmp ne i32 %v, 0 + %cond = select i1 %tobool, i32 %cnt, i32 32 + ret i32 %cond +} +; CHECK-LABEL: test5b_cttz +; CHECK: tzcnt +; CHECK-NEXT: ret + + +define i64 @test6b_cttz(i64 %v) { + %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true) + %tobool = icmp ne i64 %v, 0 + %cond = select i1 %tobool, i64 %cnt, i64 64 + ret i64 %cond +} +; CHECK-LABEL: test6b_cttz +; CHECK: tzcnt +; CHECK-NEXT: ret + declare i64 @llvm.cttz.i64(i64, i1) declare i32 @llvm.cttz.i32(i32, i1) diff --git a/test/CodeGen/X86/macho-comdat.ll b/test/CodeGen/X86/macho-comdat.ll index 3c2d997b4594..60560470ed5b 100644 --- a/test/CodeGen/X86/macho-comdat.ll +++ b/test/CodeGen/X86/macho-comdat.ll @@ -2,5 +2,5 @@ ; RUN: FileCheck < %t %s $f = comdat any -@v = global i32 0, comdat $f +@v = global i32 0, comdat($f) ; CHECK: LLVM ERROR: MachO doesn't support COMDATs, 'f' cannot be lowered. diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll new file mode 100644 index 000000000000..726d7125a638 --- /dev/null +++ b/test/CodeGen/X86/masked_memop.ll @@ -0,0 +1,217 @@ +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512 +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2 +; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=AVX_SCALAR + +; AVX512-LABEL: test1 +; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z} + +; AVX2-LABEL: test1 +; AVX2: vpmaskmovd 32(%rdi) +; AVX2: vpmaskmovd (%rdi) +; AVX2-NOT: blend + +; AVX_SCALAR-LABEL: test1 +; AVX_SCALAR-NOT: masked +; AVX_SCALAR: extractelement +; AVX_SCALAR: insertelement +; AVX_SCALAR: extractelement +; AVX_SCALAR: insertelement +define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) { + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>undef) + ret <16 x i32> %res +} + +; AVX512-LABEL: test2 +; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z} + +; AVX2-LABEL: test2 +; AVX2: vpmaskmovd {{.*}}(%rdi) +; AVX2: vpmaskmovd {{.*}}(%rdi) +; AVX2-NOT: blend +define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) { + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1>%mask, <16 x i32>zeroinitializer) + ret <16 x i32> %res +} + +; AVX512-LABEL: test3 +; AVX512: vmovdqu32 %zmm1, (%rdi) {%k1} + +; AVX_SCALAR-LABEL: test3 +; AVX_SCALAR-NOT: masked +; AVX_SCALAR: extractelement +; AVX_SCALAR: store +; AVX_SCALAR: extractelement +; AVX_SCALAR: store +; AVX_SCALAR: extractelement +; AVX_SCALAR: store +define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) { + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask) + ret void +} + +; AVX512-LABEL: 
test4 +; AVX512: vmovups (%rdi), %zmm{{.*{%k[1-7]}}} + +; AVX2-LABEL: test4 +; AVX2: vmaskmovps {{.*}}(%rdi) +; AVX2: vmaskmovps {{.*}}(%rdi) +; AVX2: blend +define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) { + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + %res = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %addr, i32 4, <16 x i1>%mask, <16 x float> %dst) + ret <16 x float> %res +} + +; AVX512-LABEL: test5 +; AVX512: vmovupd (%rdi), %zmm1 {%k1} + +; AVX2-LABEL: test5 +; AVX2: vmaskmovpd +; AVX2: vblendvpd +; AVX2: vmaskmovpd +; AVX2: vblendvpd +define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) { + %mask = icmp eq <8 x i32> %trigger, zeroinitializer + %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst) + ret <8 x double> %res +} + +; AVX2-LABEL: test6 +; AVX2: vmaskmovpd +; AVX2: vblendvpd +define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) { + %mask = icmp eq <2 x i64> %trigger, zeroinitializer + %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1>%mask, <2 x double>%dst) + ret <2 x double> %res +} + +; AVX2-LABEL: test7 +; AVX2: vmaskmovps {{.*}}(%rdi) +; AVX2: blend +define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) { + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1>%mask, <4 x float>%dst) + ret <4 x float> %res +} + +; AVX2-LABEL: test8 +; AVX2: vpmaskmovd {{.*}}(%rdi) +; AVX2: blend +define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) { + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1>%mask, <4 x i32>%dst) + ret <4 x i32> %res +} + +; AVX2-LABEL: test9 +; AVX2: vpmaskmovd %xmm +define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) { + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1>%mask) + ret void +} + +; AVX2-LABEL: test10 +; AVX2: vmaskmovpd (%rdi), %ymm +; AVX2: blend +define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) { + %mask = icmp eq <4 x i32> %trigger, zeroinitializer + %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 4, <4 x i1>%mask, <4 x double>%dst) + ret <4 x double> %res +} + +; AVX2-LABEL: test11 +; AVX2: vmaskmovps +; AVX2: vblendvps +define <8 x float> @test11(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) { + %mask = icmp eq <8 x i32> %trigger, zeroinitializer + %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 4, <8 x i1>%mask, <8 x float>%dst) + ret <8 x float> %res +} + +; AVX2-LABEL: test12 +; AVX2: vpmaskmovd %ymm +define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) { + %mask = icmp eq <8 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask) + ret void +} + +; AVX512-LABEL: test13 +; AVX512: vmovups %zmm1, (%rdi) {%k1} + +define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) { + %mask = icmp eq <16 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask) + ret void +} + +; AVX2-LABEL: test14 +; AVX2: vinsertps 
{{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX2: vmaskmovps +define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) { + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2f32(<2 x float>%val, <2 x float>* %addr, i32 4, <2 x i1>%mask) + ret void +} + +; AVX2-LABEL: test15 +; AVX2: vpmaskmovq +define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) { + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + call void @llvm.masked.store.v2i32(<2 x i32>%val, <2 x i32>* %addr, i32 4, <2 x i1>%mask) + ret void +} + +; AVX2-LABEL: test16 +; AVX2: vmaskmovps +; AVX2: vblendvps +define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) { + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>%dst) + ret <2 x float> %res +} + +; AVX2-LABEL: test17 +; AVX2: vpmaskmovq +; AVX2: vblendvpd +define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) { + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst) + ret <2 x i32> %res +} + +; AVX2-LABEL: test18 +; AVX2: vmaskmovps +; AVX2-NOT: blend +define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) { + %mask = icmp eq <2 x i32> %trigger, zeroinitializer + %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1>%mask, <2 x float>undef) + ret <2 x float> %res +} + + +declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>) +declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) +declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) +declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>) +declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) +declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>) +declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>) +declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) +declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>) +declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) +declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>) +declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) +declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>) +declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) +declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>) +declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>) +declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) +declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>) +declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) + diff --git a/test/CodeGen/X86/mem-intrin-base-reg.ll b/test/CodeGen/X86/mem-intrin-base-reg.ll new file mode 100644 index 000000000000..9a6de3dd1d92 --- /dev/null +++ b/test/CodeGen/X86/mem-intrin-base-reg.ll @@ -0,0 +1,100 @@ +; RUN: llc -mtriple=i686-windows -mattr=+sse2 < %s | FileCheck %s + 
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32" +target triple = "i686-pc-windows-msvc" + +; There is a conflict between lowering the X86 memory intrinsics and the "base" +; register used to address stack locals. See X86RegisterInfo::hasBaseRegister +; for when this is necessary. Typically, we choose ESI for the base register, +; which all of the X86 string instructions use. + +; The pattern of vector icmp and extractelement is used in these tests because +; it forces creation of an aligned stack temporary. Perhaps such temporaries +; shouldn't be aligned. + +declare void @escape_vla_and_icmp(i8*, i1 zeroext) +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) +declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) + +define i32 @memcpy_novla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) { + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false) + br i1 %cond, label %spill_vectors, label %no_vectors + +no_vectors: + ret i32 0 + +spill_vectors: + %vp1 = getelementptr <4 x i32>* %vp0, i32 1 + %v0 = load <4 x i32>* %vp0 + %v1 = load <4 x i32>* %vp1 + %vicmp = icmp slt <4 x i32> %v0, %v1 + %icmp = extractelement <4 x i1> %vicmp, i32 0 + call void @escape_vla_and_icmp(i8* null, i1 zeroext %icmp) + %r = extractelement <4 x i32> %v0, i32 0 + ret i32 %r +} + +; CHECK-LABEL: _memcpy_novla_vector: +; CHECK: andl $-16, %esp +; CHECK-DAG: movl $32, %ecx +; CHECK-DAG: movl {{.*}}, %esi +; CHECK-DAG: movl {{.*}}, %edi +; CHECK: rep;movsl + +define i32 @memcpy_vla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) { + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false) + br i1 %cond, label %spill_vectors, label %no_vectors + +no_vectors: + ret i32 0 + +spill_vectors: + %vp1 = getelementptr <4 x i32>* %vp0, i32 1 + %v0 = load <4 x i32>* %vp0 + %v1 = load <4 x i32>* %vp1 + %vicmp = icmp slt <4 x i32> %v0, %v1 + %icmp = extractelement <4 x i1> %vicmp, i32 0 + %vla = alloca i8, i32 %n + call void @escape_vla_and_icmp(i8* %vla, i1 zeroext %icmp) + %r = extractelement <4 x i32> %v0, i32 0 + ret i32 %r +} + +; CHECK-LABEL: _memcpy_vla_vector: +; CHECK: andl $-16, %esp +; CHECK: movl %esp, %esi +; CHECK: pushl $128 +; CHECK: calll _memcpy +; CHECK: calll __chkstk + +; stosd doesn't clobber esi, so we can use it. + +define i32 @memset_vla_vector(<4 x i32>* %vp0, i8* %a, i32 %n, i1 zeroext %cond) { + call void @llvm.memset.p0i8.i32(i8* %a, i8 42, i32 128, i32 4, i1 false) + br i1 %cond, label %spill_vectors, label %no_vectors + +no_vectors: + ret i32 0 + +spill_vectors: + %vp1 = getelementptr <4 x i32>* %vp0, i32 1 + %v0 = load <4 x i32>* %vp0 + %v1 = load <4 x i32>* %vp1 + %vicmp = icmp slt <4 x i32> %v0, %v1 + %icmp = extractelement <4 x i1> %vicmp, i32 0 + %vla = alloca i8, i32 %n + call void @escape_vla_and_icmp(i8* %vla, i1 zeroext %icmp) + %r = extractelement <4 x i32> %v0, i32 0 + ret i32 %r +} + +; CHECK-LABEL: _memset_vla_vector: +; CHECK: andl $-16, %esp +; CHECK: movl %esp, %esi +; CHECK-DAG: movl $707406378, %eax # imm = 0x2A2A2A2A +; CHECK-DAG: movl $32, %ecx +; CHECK-DAG: movl {{.*}}, %edi +; CHECK-NOT: movl {{.*}}, %esi +; CHECK: rep;stosl + +; Add a test for memcmp if we ever add a special lowering for it.
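+; A minimal sketch, assuming the usual C library memcmp signature on i686, of
+; what such a test could look like (kept as a comment since no special memcmp
+; lowering exists yet; the function name is illustrative):
+;
+; declare i32 @memcmp(i8* nocapture, i8* nocapture readonly, i32)
+;
+; define i32 @memcmp_novla(i8* %a, i8* %b) {
+;   %r = call i32 @memcmp(i8* %a, i8* %b, i32 128)
+;   ret i32 %r
+; }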
diff --git a/test/CodeGen/X86/mem-promote-integers.ll b/test/CodeGen/X86/mem-promote-integers.ll index 0015df0c1fac..ea38b95a864e 100644 --- a/test/CodeGen/X86/mem-promote-integers.ll +++ b/test/CodeGen/X86/mem-promote-integers.ll @@ -1,8 +1,8 @@ ; Test the basic functionality of integer element promotions of different types. ; This tests checks passing of arguments, loading and storing to memory and ; basic arithmetic. -; RUN: llc -march=x86 < %s -; RUN: llc -march=x86-64 < %s +; RUN: llc -march=x86 < %s > /dev/null +; RUN: llc -march=x86-64 < %s > /dev/null define <1 x i8> @test_1xi8(<1 x i8> %x, <1 x i8>* %b) { %bb = load <1 x i8>* %b diff --git a/test/CodeGen/X86/misched-code-difference-with-debug.ll b/test/CodeGen/X86/misched-code-difference-with-debug.ll new file mode 100644 index 000000000000..fb2a986e561b --- /dev/null +++ b/test/CodeGen/X86/misched-code-difference-with-debug.ll @@ -0,0 +1,90 @@ +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-unknown -mcpu=generic | FileCheck %s +; Both functions should produce the same code. The presence of debug values +; should not affect the scheduling strategy. +; Generated from: +; char argc; +; class C { +; public: +; int test(char ,char ,char ,...); +; }; +; void foo() { +; C c; +; char lc = argc; +; c.test(0,argc,0,lc); +; c.test(0,argc,0,lc); +; } +; +; with +; clang -O2 -c test.cpp -emit-llvm -S +; clang -O2 -c test.cpp -emit-llvm -S -g +; + + +%class.C = type { i8 } + +@argc = global i8 0, align 1 + +declare i32 @test_function(%class.C*, i8 signext, i8 signext, i8 signext, ...) + +; CHECK-LABEL: test_without_debug +; CHECK: movl [[A:%[a-z]+]], [[B:%[a-z]+]] +; CHECK-NEXT: movl [[A]], [[C:%[a-z]+]] +define void @test_without_debug() { +entry: + %c = alloca %class.C, align 1 + %0 = load i8* @argc, align 1 + %conv = sext i8 %0 to i32 + %call = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv) + %1 = load i8* @argc, align 1 + %call2 = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv) + ret void +} + +; CHECK-LABEL: test_with_debug +; CHECK: movl [[A]], [[B]] +; CHECK-NEXT: movl [[A]], [[C]] +define void @test_with_debug() { +entry: + %c = alloca %class.C, align 1 + %0 = load i8* @argc, align 1 + tail call void @llvm.dbg.value(metadata i8 %0, i64 0, metadata !19, metadata !29) + %conv = sext i8 %0 to i32 + tail call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29) + %call = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %0, i8 signext 0, i32 %conv) + %1 = load i8* @argc, align 1 + call void @llvm.dbg.value(metadata %class.C* %c, i64 0, metadata !18, metadata !29) + %call2 = call i32 (%class.C*, i8, i8, i8, ...)* @test_function(%class.C* %c, i8 signext 0, i8 signext %1, i8 signext 0, i32 %conv) + ret void +} + +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!22, !23} + +!0 = !{!"", !1, !2, !3, !12, !20, !2} ; [ DW_TAG_compile_unit ] [test.cpp] [DW_LANG_C_plus_plus] +!1 = !{!"test.cpp", !""} +!2 = !{} +!3 = !{!4} +!4 = !{!"0x2\00C\002\008\008\000\000\000", !1, null, null, !5, null, null, !"_ZTS1C"} ; [ DW_TAG_class_type ] [C] [line 2, size 8, align 8, offset 0] [def] [from ] +!5 = !{!6} +!6 = !{!"", !1, !"_ZTS1C", !7, null, null, null, null, null} ; [ DW_TAG_subprogram ] [line 4] [public] [test] +!7 = !{!"", null, null, null, !8, null, null, null} ; [ 
DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = !{!9, !10, !11, !11, !11, null} +!9 = !{!"", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!10 = !{!"", null, null, !"_ZTS1C"} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from _ZTS1C] +!11 = !{!"0x24\00char\000\008\008\000\000\006", null, null} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char] +!12 = !{!13} +!13 = !{!"0x2e\00test_with_debug\00test_with_debug\00test_with_debug\006\000\001\000\000\00256\001\006", !1, !14, !15, null, void ()* @test_with_debug, null, null, !17} ; [ DW_TAG_subprogram ] [line 6] [def] [test_with_debug] +!14 = !{!"0x29", !1} +!15 = !{!"0x15\00\000\000\000\000\000\000", null, null, null, !16, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!16 = !{null} +!17 = !{!18, !19} +!18 = !{!"0x100\00c\007\000", !13, !14, !"_ZTS1C"} ; [ DW_TAG_auto_variable ] [c] [line 7] +!19 = !{!"0x100\00lc\008\000", !13, !14, !11} ; [ DW_TAG_auto_variable ] [lc] [line 8] +!20 = !{!21} +!21 = !{!"0x34\00argc\00argc\00\001\000\001", null, !14, !11, i8* @argc, null} ; [ DW_TAG_variable ] [argc] [line 1] [def] +!22 = !{i32 2, !"Dwarf Version", i32 4} +!23 = !{i32 2, !"Debug Info Version", i32 2} +!25 = !MDLocation(line: 8, column: 3, scope: !13) +!29 = !{!"0x102"} ; [ DW_TAG_expression ] diff --git a/test/CodeGen/X86/misched-copy.ll b/test/CodeGen/X86/misched-copy.ll index 4485b8a244a8..3e3729285d27 100644 --- a/test/CodeGen/X86/misched-copy.ll +++ b/test/CodeGen/X86/misched-copy.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s ; ; Test scheduling of copy instructions. ; @@ -44,6 +44,6 @@ end: attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } -!0 = metadata !{metadata !"float", metadata !1} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!0 = !{!"float", !1} +!1 = !{!"omnipotent char", !2} +!2 = !{!"Simple C/C++ TBAA"} diff --git a/test/CodeGen/X86/misched-crash.ll b/test/CodeGen/X86/misched-crash.ll index 7644ee070878..21c3fa3510d6 100644 --- a/test/CodeGen/X86/misched-crash.ll +++ b/test/CodeGen/X86/misched-crash.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -enable-misched -verify-misched +; RUN: llc < %s -verify-machineinstrs -enable-misched -verify-misched target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10" diff --git a/test/CodeGen/X86/misched-matmul.ll b/test/CodeGen/X86/misched-matmul.ll index 3ea6512258d4..5454b7cf780a 100644 --- a/test/CodeGen/X86/misched-matmul.ll +++ b/test/CodeGen/X86/misched-matmul.ll @@ -10,7 +10,7 @@ ; more complex cases. 
; ; CHECK: @wrap_mul4 -; CHECK: 22 regalloc - Number of spills inserted +; CHECK: 23 regalloc - Number of spills inserted define void @wrap_mul4(double* nocapture %Out, [4 x double]* nocapture %A, [4 x double]* nocapture %B) #0 { entry: diff --git a/test/CodeGen/X86/movgs.ll b/test/CodeGen/X86/movgs.ll index 71b0723c429e..96c5dbb8ea98 100644 --- a/test/CodeGen/X86/movgs.ll +++ b/test/CodeGen/X86/movgs.ll @@ -3,40 +3,58 @@ ; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X64 define i32 @test1() nounwind readonly { +; X32-LABEL: test1: +; X32: # BB#0: # %entry +; X32-NEXT: movl %gs:196, %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: retl +; +; X64-LABEL: test1: +; X64: # BB#0: # %entry +; X64-NEXT: movq %gs:320, %rax +; X64-NEXT: movl (%rax), %eax +; X64-NEXT: retq entry: %tmp = load i32* addrspace(256)* getelementptr (i32* addrspace(256)* inttoptr (i32 72 to i32* addrspace(256)*), i32 31) ; <i32*> [#uses=1] %tmp1 = load i32* %tmp ; <i32> [#uses=1] ret i32 %tmp1 } -; X32-LABEL: test1: -; X32: movl %gs:196, %eax -; X32: movl (%eax), %eax -; X32: ret - -; X64-LABEL: test1: -; X64: movq %gs:320, %rax -; X64: movl (%rax), %eax -; X64: ret define i64 @test2(void (i8*)* addrspace(256)* %tmp8) nounwind { +; X32-LABEL: test2: +; X32: # BB#0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: calll *%gs:(%eax) +; X32-NEXT: xorl %eax, %eax +; X32-NEXT: xorl %edx, %edx +; X32-NEXT: addl $12, %esp +; X32-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: # %entry +; X64-NEXT: {{(subq.*%rsp|pushq)}} +; X64-NEXT: callq *%gs:(%{{(rcx|rdi)}}) +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: {{(addq.*%rsp|popq)}} +; X64-NEXT: retq entry: %tmp9 = load void (i8*)* addrspace(256)* %tmp8, align 8 tail call void %tmp9(i8* undef) nounwind optsize ret i64 0 } -; rdar://8453210 -; X32-LABEL: test2: -; X32: movl {{.*}}(%esp), %eax -; X32: calll *%gs:(%eax) - -; X64-LABEL: test2: -; X64: callq *%gs:([[A0:%rdi|%rcx]]) - - - - define <2 x i64> @pmovsxwd_1(i64 addrspace(256)* %p) nounwind readonly { +; X32-LABEL: pmovsxwd_1: +; X32: # BB#0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: pmovsxwd %gs:(%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: pmovsxwd_1: +; X64: # BB#0: # %entry +; X64-NEXT: pmovsxwd %gs:(%{{(rcx|rdi)}}), %xmm0 +; X64-NEXT: retq entry: %0 = load i64 addrspace(256)* %p %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %0, i32 0 @@ -44,20 +62,26 @@ entry: %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone %3 = bitcast <4 x i32> %2 to <2 x i64> ret <2 x i64> %3 - -; X32-LABEL: pmovsxwd_1: -; X32: movl 4(%esp), %eax -; X32: pmovsxwd %gs:(%eax), %xmm0 -; X32: ret - -; X64-LABEL: pmovsxwd_1: -; X64: pmovsxwd %gs:([[A0]]), %xmm0 -; X64: ret } ; The two loads here both look identical to selection DAG, except for their ; address spaces. Make sure they aren't CSE'd. 
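; (On x86, LLVM maps address space 256 to %gs and 257 to %fs, so the two loads
; really do differ only in their segment register, as the %gs:196 and %fs:196
; lines below check.)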
define i32 @test_no_cse() nounwind readonly { +; X32-LABEL: test_no_cse: +; X32: # BB#0: # %entry +; X32-NEXT: movl %gs:196, %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: movl %fs:196, %ecx +; X32-NEXT: addl (%ecx), %eax +; X32-NEXT: retl +; +; X64-LABEL: test_no_cse: +; X64: # BB#0: # %entry +; X64-NEXT: movq %gs:320, %rax +; X64-NEXT: movl (%rax), %eax +; X64-NEXT: movq %fs:320, %rcx +; X64-NEXT: addl (%rcx), %eax +; X64-NEXT: retq entry: %tmp = load i32* addrspace(256)* getelementptr (i32* addrspace(256)* inttoptr (i32 72 to i32* addrspace(256)*), i32 31) ; <i32*> [#uses=1] %tmp1 = load i32* %tmp ; <i32> [#uses=1] @@ -66,9 +90,5 @@ entry: %tmp4 = add i32 %tmp1, %tmp3 ret i32 %tmp4 } -; X32-LABEL: test_no_cse: -; X32: movl %gs:196 -; X32: movl %fs:196 -; X32: ret declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone diff --git a/test/CodeGen/X86/movntdq-no-avx.ll b/test/CodeGen/X86/movntdq-no-avx.ll index 8b7e6ef15258..cc35e201e6b3 100644 --- a/test/CodeGen/X86/movntdq-no-avx.ll +++ b/test/CodeGen/X86/movntdq-no-avx.ll @@ -9,4 +9,4 @@ entry: ret void } -!0 = metadata !{i32 1} +!0 = !{i32 1} diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll new file mode 100644 index 000000000000..cb48ed747be4 --- /dev/null +++ b/test/CodeGen/X86/movtopush.ll @@ -0,0 +1,112 @@ +; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL +; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED +declare void @good(i32 %a, i32 %b, i32 %c, i32 %d) +declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d) + +; Here, we should have a reserved frame, so we don't expect pushes +; NORMAL-LABEL: test1 +; NORMAL: subl $16, %esp +; NORMAL-NEXT: movl $4, 12(%esp) +; NORMAL-NEXT: movl $3, 8(%esp) +; NORMAL-NEXT: movl $2, 4(%esp) +; NORMAL-NEXT: movl $1, (%esp) +; NORMAL-NEXT: call +define void @test1() { +entry: + call void @good(i32 1, i32 2, i32 3, i32 4) + ret void +} + +; Here, we expect a sequence of 4 immediate pushes +; NORMAL-LABEL: test2 +; NORMAL-NOT: subl {{.*}} %esp +; NORMAL: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl $1 +; NORMAL-NEXT: call +define void @test2(i32 %k) { +entry: + %a = alloca i32, i32 %k + call void @good(i32 1, i32 2, i32 3, i32 4) + ret void +} + +; Again, we expect a sequence of 4 immediate pushes +; Checks that we generate the right pushes for >8bit immediates +; NORMAL-LABEL: test2b +; NORMAL-NOT: subl {{.*}} %esp +; NORMAL: pushl $4096 +; NORMAL-NEXT: pushl $3072 +; NORMAL-NEXT: pushl $2048 +; NORMAL-NEXT: pushl $1024 +; NORMAL-NEXT: call +define void @test2b(i32 %k) { +entry: + %a = alloca i32, i32 %k + call void @good(i32 1024, i32 2048, i32 3072, i32 4096) + ret void +} + +; The first push should push a register +; NORMAL-LABEL: test3 +; NORMAL-NOT: subl {{.*}} %esp +; NORMAL: pushl $4 +; NORMAL-NEXT: pushl $3 +; NORMAL-NEXT: pushl $2 +; NORMAL-NEXT: pushl %e{{..}} +; NORMAL-NEXT: call +define void @test3(i32 %k) { +entry: + %a = alloca i32, i32 %k + call void @good(i32 %k, i32 2, i32 3, i32 4) + ret void +} + +; We don't support weird calling conventions +; NORMAL-LABEL: test4 +; NORMAL: subl $12, %esp +; NORMAL-NEXT: movl $4, 8(%esp) +; NORMAL-NEXT: movl $3, 4(%esp) +; NORMAL-NEXT: movl $1, (%esp) +; NORMAL-NEXT: movl $2, %eax +; NORMAL-NEXT: call +define void @test4(i32 %k) { +entry: + %a = alloca i32, i32 %k + call void @inreg(i32 1, i32 2, i32 3, i32 4) + ret void +} + +; Check that additional alignment is added when the 
pushes +; don't add up to the required alignment. +; ALIGNED-LABEL: test5 +; ALIGNED: subl $16, %esp +; ALIGNED-NEXT: pushl $4 +; ALIGNED-NEXT: pushl $3 +; ALIGNED-NEXT: pushl $2 +; ALIGNED-NEXT: pushl $1 +; ALIGNED-NEXT: call +define void @test5(i32 %k) { +entry: + %a = alloca i32, i32 %k + call void @good(i32 1, i32 2, i32 3, i32 4) + ret void +} + +; Check that pushing the addresses of globals (or generally, things that +; aren't exactly immediates) isn't broken. +; Fixes PR21878. +; NORMAL-LABEL: test6 +; NORMAL: pushl $_ext +; NORMAL-NEXT: call +declare void @f(i8*) +@ext = external constant i8 + +define void @test6() { + call void @f(i8* @ext) + br label %bb +bb: + alloca i32 + ret void +} diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll index 69105158906f..f0bdbba50ef3 100644 --- a/test/CodeGen/X86/ms-inline-asm.ll +++ b/test/CodeGen/X86/ms-inline-asm.ll @@ -110,7 +110,7 @@ define i32 @t31() { entry: %val = alloca i32, align 64 store i32 -1, i32* %val, align 64 - call void asm sideeffect inteldialect "mov dword ptr $0, esp", "=*m,~{dirflag},~{fpsr},~{flags}"(i32* %val) #1 + call void asm sideeffect inteldialect "mov dword ptr $0, esp", "=*m,~{dirflag},~{fpsr},~{flags}"(i32* %val) %sp = load i32* %val, align 64 ret i32 %sp ; CHECK-LABEL: t31: @@ -125,3 +125,12 @@ entry: ; CHECK: movl (%esp), %eax ; CHECK: ret } + +declare hidden void @other_func() + +define void @naked() #0 { + call void asm sideeffect inteldialect "call dword ptr $0", "*m,~{eax},~{ebx},~{ecx},~{edx},~{edi},~{esi},~{esp},~{ebp},~{dirflag},~{fpsr},~{flags}"(void()* @other_func) + unreachable +} + +attributes #0 = { naked } diff --git a/test/CodeGen/X86/musttail-fastcall.ll b/test/CodeGen/X86/musttail-fastcall.ll new file mode 100644 index 000000000000..c7e5ffcfa877 --- /dev/null +++ b/test/CodeGen/X86/musttail-fastcall.ll @@ -0,0 +1,109 @@ +; RUN: llc < %s -mtriple=i686-pc-win32 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2 +; RUN: llc < %s -mtriple=i686-pc-win32 -mattr=+sse2,+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: llc < %s -mtriple=i686-pc-win32 -mattr=+sse2,+avx,+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 + +; While we don't support varargs with fastcall, we do support forwarding. + +@asdf = internal constant [4 x i8] c"asdf" + +declare void @puts(i8*) + +define i32 @call_fast_thunk() { + %r = call x86_fastcallcc i32 (...)* @fast_thunk(i32 inreg 1, i32 inreg 2, i32 3) + ret i32 %r +} + +define x86_fastcallcc i32 @fast_thunk(...) { + call void @puts(i8* getelementptr ([4 x i8]* @asdf, i32 0, i32 0)) + %r = musttail call x86_fastcallcc i32 (...)* bitcast (i32 (i32, i32, i32)* @fast_target to i32 (...)*) (...) + ret i32 %r +} + +; Check that we spill and fill around the call to puts. + +; CHECK-LABEL: @fast_thunk@0: +; CHECK-DAG: movl %ecx, {{.*}} +; CHECK-DAG: movl %edx, {{.*}} +; CHECK: calll _puts +; CHECK-DAG: movl {{.*}}, %ecx +; CHECK-DAG: movl {{.*}}, %edx +; CHECK: jmp @fast_target@12 + +define x86_fastcallcc i32 @fast_target(i32 inreg %a, i32 inreg %b, i32 %c) { + %a0 = add i32 %a, %b + %a1 = add i32 %a0, %c + ret i32 %a1 +} + +; Repeat the test for vectorcall, which has XMM registers. + +define i32 @call_vector_thunk() { + %r = call x86_vectorcallcc i32 (...)* @vector_thunk(i32 inreg 1, i32 inreg 2, i32 3) + ret i32 %r +} + +define x86_vectorcallcc i32 @vector_thunk(...) 
{ + call void @puts(i8* getelementptr ([4 x i8]* @asdf, i32 0, i32 0)) + %r = musttail call x86_vectorcallcc i32 (...)* bitcast (i32 (i32, i32, i32)* @vector_target to i32 (...)*) (...) + ret i32 %r +} + +; Check that we spill and fill SSE registers around the call to puts. + +; CHECK-LABEL: vector_thunk@@0: +; CHECK-DAG: movl %ecx, {{.*}} +; CHECK-DAG: movl %edx, {{.*}} + +; SSE2-DAG: movups %xmm0, {{.*}} +; SSE2-DAG: movups %xmm1, {{.*}} +; SSE2-DAG: movups %xmm2, {{.*}} +; SSE2-DAG: movups %xmm3, {{.*}} +; SSE2-DAG: movups %xmm4, {{.*}} +; SSE2-DAG: movups %xmm5, {{.*}} + +; AVX-DAG: vmovups %ymm0, {{.*}} +; AVX-DAG: vmovups %ymm1, {{.*}} +; AVX-DAG: vmovups %ymm2, {{.*}} +; AVX-DAG: vmovups %ymm3, {{.*}} +; AVX-DAG: vmovups %ymm4, {{.*}} +; AVX-DAG: vmovups %ymm5, {{.*}} + +; AVX512-DAG: vmovups %zmm0, {{.*}} +; AVX512-DAG: vmovups %zmm1, {{.*}} +; AVX512-DAG: vmovups %zmm2, {{.*}} +; AVX512-DAG: vmovups %zmm3, {{.*}} +; AVX512-DAG: vmovups %zmm4, {{.*}} +; AVX512-DAG: vmovups %zmm5, {{.*}} + +; CHECK: calll _puts + +; SSE2-DAG: movups {{.*}}, %xmm0 +; SSE2-DAG: movups {{.*}}, %xmm1 +; SSE2-DAG: movups {{.*}}, %xmm2 +; SSE2-DAG: movups {{.*}}, %xmm3 +; SSE2-DAG: movups {{.*}}, %xmm4 +; SSE2-DAG: movups {{.*}}, %xmm5 + +; AVX-DAG: vmovups {{.*}}, %ymm0 +; AVX-DAG: vmovups {{.*}}, %ymm1 +; AVX-DAG: vmovups {{.*}}, %ymm2 +; AVX-DAG: vmovups {{.*}}, %ymm3 +; AVX-DAG: vmovups {{.*}}, %ymm4 +; AVX-DAG: vmovups {{.*}}, %ymm5 + +; AVX512-DAG: vmovups {{.*}}, %zmm0 +; AVX512-DAG: vmovups {{.*}}, %zmm1 +; AVX512-DAG: vmovups {{.*}}, %zmm2 +; AVX512-DAG: vmovups {{.*}}, %zmm3 +; AVX512-DAG: vmovups {{.*}}, %zmm4 +; AVX512-DAG: vmovups {{.*}}, %zmm5 + +; CHECK-DAG: movl {{.*}}, %ecx +; CHECK-DAG: movl {{.*}}, %edx +; CHECK: jmp vector_target@@12 + +define x86_vectorcallcc i32 @vector_target(i32 inreg %a, i32 inreg %b, i32 %c) { + %a0 = add i32 %a, %b + %a1 = add i32 %a0, %c + ret i32 %a1 +} diff --git a/test/CodeGen/X86/musttail-varargs.ll b/test/CodeGen/X86/musttail-varargs.ll new file mode 100644 index 000000000000..7f105a13a6a0 --- /dev/null +++ b/test/CodeGen/X86/musttail-varargs.ll @@ -0,0 +1,140 @@ +; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-linux | FileCheck %s --check-prefix=LINUX +; RUN: llc < %s -enable-tail-merge=0 -mtriple=x86_64-windows | FileCheck %s --check-prefix=WINDOWS +; RUN: llc < %s -enable-tail-merge=0 -mtriple=i686-windows | FileCheck %s --check-prefix=X86 + +; Test that we actually spill and reload all arguments in the variadic argument +; pack. Doing a normal call will clobber all argument registers, and we will +; spill around it. A simple adjustment should not require any XMM spills. + +declare void @llvm.va_start(i8*) nounwind + +declare void(i8*, ...)* @get_f(i8* %this) + +define void @f_thunk(i8* %this, ...) { + ; Use va_start so that we exercise the combination. + %ap = alloca [4 x i8*], align 16 + %ap_i8 = bitcast [4 x i8*]* %ap to i8* + call void @llvm.va_start(i8* %ap_i8) + + %fptr = call void(i8*, ...)*(i8*)* @get_f(i8* %this) + musttail call void (i8*, ...)* %fptr(i8* %this, ...) + ret void +} + +; Save and restore 6 GPRs, 8 XMMs, and AL around the call. 
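+; (In the x86-64 SysV convention, AL carries the number of vector registers
+; used by a varargs call, which is why it is saved and restored along with the
+; six GPR and eight XMM argument registers.)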
+ +; LINUX-LABEL: f_thunk: +; LINUX-DAG: movq %rdi, {{.*}} +; LINUX-DAG: movq %rsi, {{.*}} +; LINUX-DAG: movq %rdx, {{.*}} +; LINUX-DAG: movq %rcx, {{.*}} +; LINUX-DAG: movq %r8, {{.*}} +; LINUX-DAG: movq %r9, {{.*}} +; LINUX-DAG: movb %al, {{.*}} +; LINUX-DAG: movaps %xmm0, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm1, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm2, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm3, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm4, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm5, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm6, {{[0-9]*}}(%rsp) +; LINUX-DAG: movaps %xmm7, {{[0-9]*}}(%rsp) +; LINUX: callq get_f +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm0 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm1 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm2 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm3 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm4 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm5 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm6 +; LINUX-DAG: movaps {{[0-9]*}}(%rsp), %xmm7 +; LINUX-DAG: movq {{.*}}, %rdi +; LINUX-DAG: movq {{.*}}, %rsi +; LINUX-DAG: movq {{.*}}, %rdx +; LINUX-DAG: movq {{.*}}, %rcx +; LINUX-DAG: movq {{.*}}, %r8 +; LINUX-DAG: movq {{.*}}, %r9 +; LINUX-DAG: movb {{.*}}, %al +; LINUX: jmpq *{{.*}} # TAILCALL + +; WINDOWS-LABEL: f_thunk: +; WINDOWS-NOT: mov{{.}}ps +; WINDOWS-DAG: movq %rdx, {{.*}} +; WINDOWS-DAG: movq %rcx, {{.*}} +; WINDOWS-DAG: movq %r8, {{.*}} +; WINDOWS-DAG: movq %r9, {{.*}} +; WINDOWS-NOT: mov{{.}}ps +; WINDOWS: callq get_f +; WINDOWS-NOT: mov{{.}}ps +; WINDOWS-DAG: movq {{.*}}, %rdx +; WINDOWS-DAG: movq {{.*}}, %rcx +; WINDOWS-DAG: movq {{.*}}, %r8 +; WINDOWS-DAG: movq {{.*}}, %r9 +; WINDOWS-NOT: mov{{.}}ps +; WINDOWS: jmpq *{{.*}} # TAILCALL + +; No regparms on normal x86 conventions. + +; X86-LABEL: _f_thunk: +; X86: calll _get_f +; X86: jmpl *{{.*}} # TAILCALL + +; This thunk shouldn't require any spills and reloads, assuming the register +; allocator knows what it's doing. + +define void @g_thunk(i8* %fptr_i8, ...) { + %fptr = bitcast i8* %fptr_i8 to void (i8*, ...)* + musttail call void (i8*, ...)* %fptr(i8* %fptr_i8, ...) + ret void +} + +; LINUX-LABEL: g_thunk: +; LINUX-NOT: movq +; LINUX: jmpq *%rdi # TAILCALL + +; WINDOWS-LABEL: g_thunk: +; WINDOWS-NOT: movq +; WINDOWS: jmpq *%rcx # TAILCALL + +; X86-LABEL: _g_thunk: +; X86: jmpl *%eax # TAILCALL + +; Do a simple multi-exit multi-bb test. + +%struct.Foo = type { i1, i8*, i8* } + +@g = external global i32 + +define void @h_thunk(%struct.Foo* %this, ...) { + %cond_p = getelementptr %struct.Foo* %this, i32 0, i32 0 + %cond = load i1* %cond_p + br i1 %cond, label %then, label %else + +then: + %a_p = getelementptr %struct.Foo* %this, i32 0, i32 1 + %a_i8 = load i8** %a_p + %a = bitcast i8* %a_i8 to void (%struct.Foo*, ...)* + musttail call void (%struct.Foo*, ...)* %a(%struct.Foo* %this, ...) + ret void + +else: + %b_p = getelementptr %struct.Foo* %this, i32 0, i32 2 + %b_i8 = load i8** %b_p + %b = bitcast i8* %b_i8 to void (%struct.Foo*, ...)* + store i32 42, i32* @g + musttail call void (%struct.Foo*, ...)* %b(%struct.Foo* %this, ...) 
+ ret void +} + +; LINUX-LABEL: h_thunk: +; LINUX: jne +; LINUX: jmpq *{{.*}} # TAILCALL +; LINUX: jmpq *{{.*}} # TAILCALL +; WINDOWS-LABEL: h_thunk: +; WINDOWS: jne +; WINDOWS: jmpq *{{.*}} # TAILCALL +; WINDOWS: jmpq *{{.*}} # TAILCALL +; X86-LABEL: _h_thunk: +; X86: jne +; X86: jmpl *{{.*}} # TAILCALL +; X86: jmpl *{{.*}} # TAILCALL diff --git a/test/CodeGen/X86/named-reg-alloc.ll b/test/CodeGen/X86/named-reg-alloc.ll index 9463ea377a9d..c33b4eb75d04 100644 --- a/test/CodeGen/X86/named-reg-alloc.ll +++ b/test/CodeGen/X86/named-reg-alloc.ll @@ -11,4 +11,4 @@ entry: declare i32 @llvm.read_register.i32(metadata) nounwind -!0 = metadata !{metadata !"eax\00"} +!0 = !{!"eax\00"} diff --git a/test/CodeGen/X86/named-reg-notareg.ll b/test/CodeGen/X86/named-reg-notareg.ll index d85ddddbea85..18c517d87810 100644 --- a/test/CodeGen/X86/named-reg-notareg.ll +++ b/test/CodeGen/X86/named-reg-notareg.ll @@ -10,4 +10,4 @@ entry: declare i32 @llvm.read_register.i32(metadata) nounwind -!0 = metadata !{metadata !"notareg\00"} +!0 = !{!"notareg\00"} diff --git a/test/CodeGen/X86/nancvt.ll b/test/CodeGen/X86/nancvt.ll index 8036710b225a..8a665fa79cff 100644 --- a/test/CodeGen/X86/nancvt.ll +++ b/test/CodeGen/X86/nancvt.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -std-compile-opts | llc > %t +; RUN: opt < %s -O3 | llc > %t ; RUN: grep 2147027116 %t | count 3 ; RUN: grep 2147228864 %t | count 3 ; RUN: grep 2146502828 %t | count 3 diff --git a/test/CodeGen/X86/narrow-shl-load.ll b/test/CodeGen/X86/narrow-shl-load.ll index 30387925b34d..5175bfc2bcb1 100644 --- a/test/CodeGen/X86/narrow-shl-load.ll +++ b/test/CodeGen/X86/narrow-shl-load.ll @@ -30,40 +30,6 @@ while.end: ; preds = %while.cond ret void } - -; DAGCombiner shouldn't fold the sdiv (ashr) away. -; rdar://8636812 -; CHECK-LABEL: test2: -; CHECK: sarl - -define i32 @test2() nounwind { -entry: - %i = alloca i32, align 4 - %j = alloca i8, align 1 - store i32 127, i32* %i, align 4 - store i8 0, i8* %j, align 1 - %tmp3 = load i32* %i, align 4 - %mul = mul nsw i32 %tmp3, 2 - %conv4 = trunc i32 %mul to i8 - %conv5 = sext i8 %conv4 to i32 - %div6 = sdiv i32 %conv5, 2 - %conv7 = trunc i32 %div6 to i8 - %conv9 = sext i8 %conv7 to i32 - %cmp = icmp eq i32 %conv9, -1 - br i1 %cmp, label %if.then, label %if.end - -if.then: ; preds = %entry - ret i32 0 - -if.end: ; preds = %entry - call void @abort() noreturn - unreachable -} - -declare void @abort() noreturn - -declare void @exit(i32) noreturn - ; DAG Combiner can't fold this into a load of the 1'th byte. 
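; (That is, the shifted load is expected to keep its original width here
; rather than be shrunk to a single-byte access.)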
; PR8757 define i32 @test3(i32 *%P) nounwind ssp { diff --git a/test/CodeGen/X86/no-compact-unwind.ll b/test/CodeGen/X86/no-compact-unwind.ll deleted file mode 100644 index 991cd4ed7363..000000000000 --- a/test/CodeGen/X86/no-compact-unwind.ll +++ /dev/null @@ -1,64 +0,0 @@ -; RUN: llc < %s -mtriple x86_64-apple-macosx10.8.0 -mcpu corei7 -filetype=obj -o - \ -; RUN: | llvm-objdump -triple x86_64-apple-macosx10.8.0 -s - \ -; RUN: | FileCheck -check-prefix=CU %s -; RUN: llc < %s -mtriple x86_64-apple-darwin11 -mcpu corei7 \ -; RUN: | llvm-mc -triple x86_64-apple-darwin11 -filetype=obj -o - \ -; RUN: | llvm-objdump -triple x86_64-apple-darwin11 -s - \ -; RUN: | FileCheck -check-prefix=FROM-ASM %s - -%"struct.dyld::MappedRanges" = type { [400 x %struct.anon], %"struct.dyld::MappedRanges"* } -%struct.anon = type { %class.ImageLoader*, i64, i64 } -%class.ImageLoader = type { i32 (...)**, i8*, i8*, i32, i64, i64, i32, i32, %"struct.ImageLoader::recursive_lock"*, i16, i16, [4 x i8] } -%"struct.ImageLoader::recursive_lock" = type { i32, i32 } - -@G1 = external hidden global %"struct.dyld::MappedRanges", align 8 - -declare void @OSMemoryBarrier() optsize - -; This compact unwind encoding indicates that we could not generate correct -; compact unwind encodings for this function. This then defaults to using the -; DWARF EH frame. - -; CU: Contents of section __compact_unwind: -; CU-NEXT: 0048 00000000 00000000 42000000 00000004 -; CU-NEXT: 0058 00000000 00000000 00000000 00000000 - -; FROM-ASM: Contents of section __compact_unwind: -; FROM-ASM-NEXT: 0048 00000000 00000000 42000000 00000004 -; FROM-ASM-NEXT: 0058 00000000 00000000 00000000 00000000 - -define void @func(%class.ImageLoader* %image) optsize ssp uwtable { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: ; preds = %for.inc10, %entry - %p.019 = phi %"struct.dyld::MappedRanges"* [ @G1, %entry ], [ %1, %for.inc10 ] - br label %for.body3 - -for.body3: ; preds = %for.inc, %for.cond1.preheader - %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.inc ] - %image4 = getelementptr inbounds %"struct.dyld::MappedRanges"* %p.019, i64 0, i32 0, i64 %indvars.iv, i32 0 - %0 = load %class.ImageLoader** %image4, align 8 - %cmp5 = icmp eq %class.ImageLoader* %0, %image - br i1 %cmp5, label %if.then, label %for.inc - -if.then: ; preds = %for.body3 - tail call void @OSMemoryBarrier() optsize - store %class.ImageLoader* null, %class.ImageLoader** %image4, align 8 - br label %for.inc - -for.inc: ; preds = %if.then, %for.body3 - %indvars.iv.next = add i64 %indvars.iv, 1 - %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 400 - br i1 %exitcond, label %for.inc10, label %for.body3 - -for.inc10: ; preds = %for.inc - %next = getelementptr inbounds %"struct.dyld::MappedRanges"* %p.019, i64 0, i32 1 - %1 = load %"struct.dyld::MappedRanges"** %next, align 8 - %cmp = icmp eq %"struct.dyld::MappedRanges"* %1, null - br i1 %cmp, label %for.end11, label %for.cond1.preheader - -for.end11: ; preds = %for.inc10 - ret void -} diff --git a/test/CodeGen/X86/nonconst-static-ev.ll b/test/CodeGen/X86/nonconst-static-ev.ll index f852caeeea21..5449791f3fa3 100644 --- a/test/CodeGen/X86/nonconst-static-ev.ll +++ b/test/CodeGen/X86/nonconst-static-ev.ll @@ -1,6 +1,5 @@ ; RUN: not llc -march=x86 -mtriple=x86_64-linux-gnu < %s 2> %t ; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s -; REQUIRES: shell @0 = global i8 extractvalue ([1 x i8] select (i1 ptrtoint (i32* @1 to i1), [1 x i8] [ i8 1 ], [1 x i8] [ i8 2 
]), 0) @1 = external global i32 diff --git a/test/CodeGen/X86/nonconst-static-iv.ll b/test/CodeGen/X86/nonconst-static-iv.ll index 8fad39bcbf72..30613ef383a3 100644 --- a/test/CodeGen/X86/nonconst-static-iv.ll +++ b/test/CodeGen/X86/nonconst-static-iv.ll @@ -1,6 +1,5 @@ ; RUN: not llc -march=x86 -mtriple=x86_64-linux-gnu < %s 2> %t ; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s -; REQUIRES: shell @0 = global i8 insertvalue( { i8 } select (i1 ptrtoint (i32* @1 to i1), { i8 } { i8 1 }, { i8 } { i8 2 }), i8 0, 0) @1 = external global i32 diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll new file mode 100644 index 000000000000..f62f3725d7d8 --- /dev/null +++ b/test/CodeGen/X86/nontemporal-2.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX + + +; Make sure that we generate non-temporal stores for the test cases below. + +define void @test1(<4 x float>* %dst) { +; CHECK-LABEL: test1: +; SSE: movntps +; AVX: vmovntps + store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1 + ret void +} + +define void @test2(<4 x i32>* %dst) { +; CHECK-LABEL: test2: +; SSE: movntps +; AVX: vmovntps + store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1 + ret void +} + +define void @test3(<2 x double>* %dst) { +; CHECK-LABEL: test3: +; SSE: movntps +; AVX: vmovntps + store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1 + ret void +} + +!1 = !{i32 1} diff --git a/test/CodeGen/X86/nontemporal.ll b/test/CodeGen/X86/nontemporal.ll index ae04435ac39c..f9385df36421 100644 --- a/test/CodeGen/X86/nontemporal.ll +++ b/test/CodeGen/X86/nontemporal.ll @@ -19,4 +19,4 @@ define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E) { ret void } -!0 = metadata !{i32 1} +!0 = !{i32 1} diff --git a/test/CodeGen/X86/norex-subreg.ll b/test/CodeGen/X86/norex-subreg.ll index 2c529fdf1039..fb41dede287f 100644 --- a/test/CodeGen/X86/norex-subreg.ll +++ b/test/CodeGen/X86/norex-subreg.ll @@ -1,5 +1,5 @@ -; RUN: llc -O0 < %s -; RUN: llc < %s +; RUN: llc -O0 < %s -verify-machineinstrs +; RUN: llc < %s -verify-machineinstrs target triple = "x86_64-apple-macosx10.7" ; This test case extracts a sub_8bit_hi sub-register: diff --git a/test/CodeGen/X86/null-streamer.ll b/test/CodeGen/X86/null-streamer.ll index fa77fcb1d138..f6eb0e15aabb 100644 --- a/test/CodeGen/X86/null-streamer.ll +++ b/test/CodeGen/X86/null-streamer.ll @@ -14,16 +14,16 @@ define void @f1() { !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!11, !13} -!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !" 
", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !9, metadata !2, metadata !""} -!1 = metadata !{metadata !"", metadata !""} -!2 = metadata !{} -!3 = metadata !{metadata !4} -!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"", metadata !"", metadata !"", i32 2, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* null, null, null, metadata !2, i32 2} -!5 = metadata !{i32 786473, metadata !1} -!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} -!7 = metadata !{metadata !8} -!8 = metadata !{i32 786468, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} -!9 = metadata !{metadata !10} -!10 = metadata !{i32 786484, i32 0, null, metadata !"i", metadata !"i", metadata !"_ZL1i", metadata !5, i32 1, metadata !8, i32 1, i32 1, null, null} -!11 = metadata !{i32 2, metadata !"Dwarf Version", i32 3} -!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!0 = !{!"0x11\004\00 \001\00\000\00\000", !1, !2, !2, !3, !9, !2} ; [ DW_TAG_compile_unit ] +!1 = !{!"", !""} +!2 = !{} +!3 = !{!4} +!4 = !{!"0x2e\00\00\00\002\000\001\000\006\00256\001\002", !1, !5, !6, null, i32 ()* null, null, null, !2} ; [ DW_TAG_subprogram ] +!5 = !{!"0x29", !1} ; [ DW_TAG_file_type ] +!6 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !7, null, null, null} ; [ DW_TAG_subroutine_type ] +!7 = !{!8} +!8 = !{!"0x24\00\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] +!9 = !{!10} +!10 = !{!"0x34\00i\00i\00_ZL1i\001\001\001", null, !5, !8, null, null} ; [ DW_TAG_variable ] +!11 = !{i32 2, !"Dwarf Version", i32 3} +!13 = !{i32 1, !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/objc-gc-module-flags.ll b/test/CodeGen/X86/objc-gc-module-flags.ll index 8cb2c036a4f7..f197510f2207 100644 --- a/test/CodeGen/X86/objc-gc-module-flags.ll +++ b/test/CodeGen/X86/objc-gc-module-flags.ll @@ -7,7 +7,7 @@ !llvm.module.flags = !{!0, !1, !2, !3} -!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2} -!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0} -!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"} -!3 = metadata !{i32 1, metadata !"Objective-C Garbage Collection", i32 2} +!0 = !{i32 1, !"Objective-C Version", i32 2} +!1 = !{i32 1, !"Objective-C Image Info Version", i32 0} +!2 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"} +!3 = !{i32 1, !"Objective-C Garbage Collection", i32 2} diff --git a/test/CodeGen/X86/object-size.ll b/test/CodeGen/X86/object-size.ll index ec35d2981a16..0610f0b6de2e 100644 --- a/test/CodeGen/X86/object-size.ll +++ b/test/CodeGen/X86/object-size.ll @@ -1,4 +1,4 @@ -; RUN: llc -O0 < %s -march=x86-64 | FileCheck %s -check-prefix=X64 +; RUN: llc -O0 < %s -march=x86-64 | FileCheck %s ; ModuleID = 'ts.c' target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" @@ -12,8 +12,8 @@ entry: %tmp = load i8** @p ; <i8*> [#uses=1] %0 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp, i1 0) ; <i64> [#uses=1] %cmp = icmp ne i64 %0, -1 ; <i1> [#uses=1] -; X64: movabsq $-1, [[RAX:%r..]] -; X64: cmpq $-1, [[RAX]] +; CHECK: movq $-1, [[RAX:%r..]] +; CHECK: cmpq $-1, [[RAX]] br i1 %cmp, label %cond.true, label %cond.false cond.true: ; preds = %entry diff --git 
a/test/CodeGen/X86/osx-private-labels.ll b/test/CodeGen/X86/osx-private-labels.ll index 349ce7d0cc5e..e30cb4824aa7 100644 --- a/test/CodeGen/X86/osx-private-labels.ll +++ b/test/CodeGen/X86/osx-private-labels.ll @@ -69,3 +69,20 @@ ; CHECK: .section __DATA,__foobar,interposing ; CHECK-NEXT: .align 3 ; CHECK-NEXT: L_private12: + +@private13 = private global i32 42, section "__DATA, __objc_classlist, regular, no_dead_strip" +; CHECK: .section __DATA,__objc_classlist,regular,no_dead_strip +; CHECK-NEXT: .align 2 +; CHECK-NEXT: L_private13: + +@private14 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_classname,cstring_literals" +; CHECK: .section __TEXT,__objc_classname,cstring_literals +; CHECK-NEXT: L_private14: + +@private15 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_methname,cstring_literals" +; CHECK: .section __TEXT,__objc_methname,cstring_literals +; CHECK-NEXT: L_private15: + +@private16 = private global [4 x i8] c"zed\00", section "__TEXT,__objc_methtype,cstring_literals" +; CHECK: .section __TEXT,__objc_methtype,cstring_literals +; CHECK-NEXT: L_private16: diff --git a/test/CodeGen/X86/palignr.ll b/test/CodeGen/X86/palignr.ll index ec6564d7e2eb..3efcc2e41215 100644 --- a/test/CodeGen/X86/palignr.ll +++ b/test/CodeGen/X86/palignr.ll @@ -3,58 +3,127 @@ define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: test1: -; CHECK: pshufd -; CHECK-YONAH: pshufd +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,3,0] +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test1: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,3,0] +; CHECK-YONAH-NEXT: retl %C = shufflevector <4 x i32> %A, <4 x i32> undef, <4 x i32> < i32 1, i32 2, i32 3, i32 0 > ret <4 x i32> %C } define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: test2: -; CHECK: palignr -; CHECK-YONAH: shufps +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test2: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; CHECK-YONAH-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0] +; CHECK-YONAH-NEXT: retl %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 3, i32 4 > ret <4 x i32> %C } define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: test3: -; CHECK: palignr +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test3: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0] +; CHECK-YONAH-NEXT: retl %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 undef, i32 4 > ret <4 x i32> %C } define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: test4: -; CHECK: palignr +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test4: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; CHECK-YONAH-NEXT: movapd %xmm1, %xmm0 +; CHECK-YONAH-NEXT: retl %C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 > ret <4 x i32> %C } define <4 x float> @test5(<4 x float> %A, <4 x float> %B) nounwind { ; CHECK-LABEL: test5: -; CHECK: palignr +; CHECK: # BB#0: +; 
CHECK-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test5: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; CHECK-YONAH-NEXT: movapd %xmm1, %xmm0 +; CHECK-YONAH-NEXT: retl %C = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 > ret <4 x float> %C } define <8 x i16> @test6(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: test6: -; CHECK: palignr +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test6: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; CHECK-YONAH-NEXT: por %xmm1, %xmm0 +; CHECK-YONAH-NEXT: retl %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 3, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10 > ret <8 x i16> %C } define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: test7: -; CHECK: palignr +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test7: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; CHECK-YONAH-NEXT: por %xmm1, %xmm0 +; CHECK-YONAH-NEXT: retl %C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 undef, i32 6, i32 undef, i32 8, i32 9, i32 10, i32 11, i32 12 > ret <8 x i16> %C } define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind { ; CHECK-LABEL: test8: -; CHECK: palignr +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test8: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero +; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; CHECK-YONAH-NEXT: por %xmm1, %xmm0 +; CHECK-YONAH-NEXT: retl %C = shufflevector <16 x i8> %A, <16 x i8> %B, <16 x i32> < i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20 > ret <16 x i8> %C } @@ -65,8 +134,19 @@ define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind { ; was an UNDEF.) 
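; (The updated expectations below reflect that the current shuffle lowering
; matches this rotate to a single palignr, so the old pshufb/CHECK-NOT
; pattern no longer applies.)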
define <8 x i16> @test9(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: test9: -; CHECK-NOT: palignr -; CHECK: pshufb +; CHECK: # BB#0: +; CHECK-NEXT: palignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-YONAH-LABEL: test9: +; CHECK-YONAH: # BB#0: +; CHECK-YONAH-NEXT: movdqa %xmm1, %xmm0 +; CHECK-YONAH-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; CHECK-YONAH-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] +; CHECK-YONAH-NEXT: por %xmm0, %xmm1 +; CHECK-YONAH-NEXT: movdqa %xmm1, %xmm0 +; CHECK-YONAH-NEXT: retl %C = shufflevector <8 x i16> %B, <8 x i16> %A, <8 x i32> < i32 undef, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0 > ret <8 x i16> %C } diff --git a/test/CodeGen/X86/patchpoint-invoke.ll b/test/CodeGen/X86/patchpoint-invoke.ll new file mode 100644 index 000000000000..192cacc908ab --- /dev/null +++ b/test/CodeGen/X86/patchpoint-invoke.ll @@ -0,0 +1,63 @@ +; RUN: llc -mtriple=x86_64-unknown-linux -mcpu=corei7 < %s | FileCheck %s + +; Test invoking of patchpoints +; +define i64 @patchpoint_invoke(i64 %p1, i64 %p2) { +entry: +; CHECK-LABEL: patchpoint_invoke: +; CHECK-NEXT: .cfi_startproc +; CHECK: [[FUNC_BEGIN:.L.*]]: +; CHECK: .cfi_lsda 3, [[EXCEPTION_LABEL:.L[^ ]*]] +; CHECK: pushq %rbp + +; Unfortunately, hardcode the name of the label that begins the patchpoint: +; CHECK: .Ltmp0: +; CHECK: movabsq $-559038736, %r11 +; CHECK-NEXT: callq *%r11 +; CHECK-NEXT: xchgw %ax, %ax +; CHECK-NEXT: [[PP_END:.L.*]]: +; CHECK: ret + %resolveCall = inttoptr i64 -559038736 to i8* + %result = invoke i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall, i32 1, i64 %p1, i64 %p2) + to label %success unwind label %threw + +success: + ret i64 %result + +threw: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + ret i64 0 +} + +; Verify that the exception table was emitted: +; CHECK: [[EXCEPTION_LABEL]]: +; CHECK-NEXT: .byte 255 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 21 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 13 +; Verify that the unwind data covers the entire patchpoint region: +; CHECK-NEXT: .long .Ltmp0-[[FUNC_BEGIN]] +; CHECK-NEXT: .long [[PP_END]]-.Ltmp0 + + +; Verify that the stackmap section got emitted: +; CHECK-LABEL: __LLVM_StackMaps: +; Header +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .short 0 +; Num Functions +; CHECK-NEXT: .long 1 +; Num LargeConstants +; CHECK-NEXT: .long 0 +; Num Callsites +; CHECK-NEXT: .long 1 +; CHECK-NEXT: .quad patchpoint_invoke + + +declare void @llvm.experimental.stackmap(i64, i32, ...) +declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) +declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) +declare i32 @__gxx_personality_v0(...) diff --git a/test/CodeGen/X86/patchpoint-webkit_jscc.ll b/test/CodeGen/X86/patchpoint-webkit_jscc.ll new file mode 100644 index 000000000000..5e76bf8d4e60 --- /dev/null +++ b/test/CodeGen/X86/patchpoint-webkit_jscc.ll @@ -0,0 +1,88 @@ +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST + +; Test the webkit_jscc calling convention. +; One argument will be passed in register, the other will be pushed on the stack. +; Return value in $rax. 
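+; (The movabsq $-559038736, %r11 / callq *%r11 pairs checked below simply
+; materialize a recognizable dummy target address through %r11, the usual
+; x86-64 scratch register, so a runtime could later patch the call site.)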
+define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { +entry: +; CHECK-LABEL: jscall_patchpoint_codegen: +; CHECK: Ltmp +; CHECK: movq %r{{.+}}, (%rsp) +; CHECK: movq %r{{.+}}, %rax +; CHECK: Ltmp +; CHECK-NEXT: movabsq $-559038736, %r11 +; CHECK-NEXT: callq *%r11 +; CHECK: movq %rax, (%rsp) +; CHECK: callq +; FAST-LABEL: jscall_patchpoint_codegen: +; FAST: Ltmp +; FAST: movq %r{{.+}}, (%rsp) +; FAST: movq %r{{.+}}, %rax +; FAST: Ltmp +; FAST-NEXT: movabsq $-559038736, %r11 +; FAST-NEXT: callq *%r11 +; FAST: movq %rax, (%rsp) +; FAST: callq + %resolveCall2 = inttoptr i64 -559038736 to i8* + %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2) + %resolveCall3 = inttoptr i64 -559038737 to i8* + tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result) + ret void +} + +; Test if the arguments are properly aligned and that we don't store undef arguments. +define i64 @jscall_patchpoint_codegen2(i64 %callee) { +entry: +; CHECK-LABEL: jscall_patchpoint_codegen2: +; CHECK: Ltmp +; CHECK: movq $6, 24(%rsp) +; CHECK-NEXT: movl $4, 16(%rsp) +; CHECK-NEXT: movq $2, (%rsp) +; CHECK: Ltmp +; CHECK-NEXT: movabsq $-559038736, %r11 +; CHECK-NEXT: callq *%r11 +; FAST-LABEL: jscall_patchpoint_codegen2: +; FAST: Ltmp +; FAST: movq $2, (%rsp) +; FAST-NEXT: movl $4, 16(%rsp) +; FAST-NEXT: movq $6, 24(%rsp) +; FAST: Ltmp +; FAST-NEXT: movabsq $-559038736, %r11 +; FAST-NEXT: callq *%r11 + %call = inttoptr i64 -559038736 to i8* + %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6) + ret i64 %result +} + +; Test if the arguments are properly aligned and that we don't store undef arguments. +define i64 @jscall_patchpoint_codegen3(i64 %callee) { +entry: +; CHECK-LABEL: jscall_patchpoint_codegen3: +; CHECK: Ltmp +; CHECK: movq $10, 48(%rsp) +; CHECK-NEXT: movl $8, 36(%rsp) +; CHECK-NEXT: movq $6, 24(%rsp) +; CHECK-NEXT: movl $4, 16(%rsp) +; CHECK-NEXT: movq $2, (%rsp) +; CHECK: Ltmp +; CHECK-NEXT: movabsq $-559038736, %r11 +; CHECK-NEXT: callq *%r11 +; FAST-LABEL: jscall_patchpoint_codegen3: +; FAST: Ltmp +; FAST: movq $2, (%rsp) +; FAST-NEXT: movl $4, 16(%rsp) +; FAST-NEXT: movq $6, 24(%rsp) +; FAST-NEXT: movl $8, 36(%rsp) +; FAST-NEXT: movq $10, 48(%rsp) +; FAST: Ltmp +; FAST-NEXT: movabsq $-559038736, %r11 +; FAST-NEXT: callq *%r11 + %call = inttoptr i64 -559038736 to i8* + %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10) + ret i64 %result +} + +declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) +declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) 
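+; (On the offsets above: arguments are laid out at their natural sizes,
+; 8-byte slots for i64 and 4-byte slots for i32, and undef arguments get a
+; slot but no store, which is why the checked offsets leave gaps such as
+; 8(%rsp).)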
+ diff --git a/test/CodeGen/X86/patchpoint.ll b/test/CodeGen/X86/patchpoint.ll index 62b12732ded4..07148f0329a2 100644 --- a/test/CodeGen/X86/patchpoint.ll +++ b/test/CodeGen/X86/patchpoint.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort < %s | FileCheck %s ; Trivial patchpoint codegen ; @@ -38,61 +39,6 @@ entry: ret void } -; Test the webkit_jscc calling convention. -; One argument will be passed in register, the other will be pushed on the stack. -; Return value in $rax. -define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { -entry: -; CHECK-LABEL: jscall_patchpoint_codegen: -; CHECK: Ltmp -; CHECK: movq %r{{.+}}, (%rsp) -; CHECK: movq %r{{.+}}, %rax -; CHECK: Ltmp -; CHECK-NEXT: movabsq $-559038736, %r11 -; CHECK-NEXT: callq *%r11 -; CHECK: movq %rax, (%rsp) -; CHECK: callq - %resolveCall2 = inttoptr i64 -559038736 to i8* - %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2) - %resolveCall3 = inttoptr i64 -559038737 to i8* - tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result) - ret void -} - -; Test if the arguments are properly aligned and that we don't store undef arguments. -define i64 @jscall_patchpoint_codegen2(i64 %callee) { -entry: -; CHECK-LABEL: jscall_patchpoint_codegen2: -; CHECK: Ltmp -; CHECK: movq $6, 24(%rsp) -; CHECK-NEXT: movl $4, 16(%rsp) -; CHECK-NEXT: movq $2, (%rsp) -; CHECK: Ltmp -; CHECK-NEXT: movabsq $-559038736, %r11 -; CHECK-NEXT: callq *%r11 - %call = inttoptr i64 -559038736 to i8* - %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6) - ret i64 %result -} - -; Test if the arguments are properly aligned and that we don't store undef arguments. -define i64 @jscall_patchpoint_codegen3(i64 %callee) { -entry: -; CHECK-LABEL: jscall_patchpoint_codegen3: -; CHECK: Ltmp -; CHECK: movq $10, 48(%rsp) -; CHECK-NEXT: movl $8, 36(%rsp) -; CHECK-NEXT: movq $6, 24(%rsp) -; CHECK-NEXT: movl $4, 16(%rsp) -; CHECK-NEXT: movq $2, (%rsp) -; CHECK: Ltmp -; CHECK-NEXT: movabsq $-559038736, %r11 -; CHECK-NEXT: callq *%r11 - %call = inttoptr i64 -559038736 to i8* - %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10) - ret i64 %result -} - ; Test patchpoints reusing the same TargetConstant. ; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4) ; There is no way to verify this, since it depends on memory allocation. @@ -125,6 +71,17 @@ entry: ret void } +; Test large target address. +define i64 @large_target_address_patchpoint_codegen() { +entry: +; CHECK-LABEL: large_target_address_patchpoint_codegen: +; CHECK: movabsq $6153737369414576827, %r11 +; CHECK-NEXT: callq *%r11 + %resolveCall2 = inttoptr i64 6153737369414576827 to i8* + %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 0) + ret i64 %result +} + declare void @llvm.experimental.stackmap(i64, i32, ...) 
declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) diff --git a/test/CodeGen/X86/peep-test-2.ll b/test/CodeGen/X86/peep-test-2.ll index e4bafbb6ffab..e43b8ef54cf5 100644 --- a/test/CodeGen/X86/peep-test-2.ll +++ b/test/CodeGen/X86/peep-test-2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -march=x86 | FileCheck %s ; CHECK: testl diff --git a/test/CodeGen/X86/peep-vector-extract-concat.ll b/test/CodeGen/X86/peep-vector-extract-concat.ll deleted file mode 100644 index f73ebb944dcd..000000000000 --- a/test/CodeGen/X86/peep-vector-extract-concat.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2,-sse4.1 | FileCheck %s -; CHECK: pshufd $3, %xmm0, %xmm0 - -; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2,-sse4.1 | FileCheck %s -check-prefix=WIN64 -; %a is passed indirectly on Win64. -; WIN64: movss 12(%rcx), %xmm0 - -define float @foo(<8 x float> %a) nounwind { - %c = extractelement <8 x float> %a, i32 3 - ret float %c -} diff --git a/test/CodeGen/X86/peep-vector-extract-insert.ll b/test/CodeGen/X86/peep-vector-extract-insert.ll deleted file mode 100644 index f958b6b2c069..000000000000 --- a/test/CodeGen/X86/peep-vector-extract-insert.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: llc < %s -march=x86-64 | grep "xorps %xmm0, %xmm0" | count 2 - -define float @foo(<4 x float> %a) { - %b = insertelement <4 x float> %a, float 0.0, i32 3 - %c = extractelement <4 x float> %b, i32 3 - ret float %c -} -define float @bar(float %a) { - %b = insertelement <4 x float> <float 0x400B333340000000, float 4.5, float 0.0, float 0x4022666660000000>, float %a, i32 3 - %c = extractelement <4 x float> %b, i32 2 - ret float %c -} diff --git a/test/CodeGen/X86/peephole-fold-movsd.ll b/test/CodeGen/X86/peephole-fold-movsd.ll new file mode 100644 index 000000000000..09d9328815da --- /dev/null +++ b/test/CodeGen/X86/peephole-fold-movsd.ll @@ -0,0 +1,31 @@ +; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s +; +; Check that x86's peephole optimization doesn't fold a 64-bit load (movsd) into +; addpd. 
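+; (The fold would be wrong because addpd reads a full 16 bytes from memory
+; while movsd loads only 8, so folding would widen the access past the end
+; of the stack object.)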
+; rdar://problem/18236850 + +%struct.S1 = type { double, double } + +@g = common global %struct.S1 zeroinitializer, align 8 + +declare void @foo3(%struct.S1*) + +; CHECK: movsd {{[0-9]*}}(%rsp), [[R0:%xmm[0-9]+]] +; CHECK: addpd [[R0]], %xmm{{[0-9]+}} + +define void @foo1(double %a.coerce0, double %a.coerce1, double %b.coerce0, double %b.coerce1) { + %1 = alloca <2 x double>, align 16 + %tmpcast = bitcast <2 x double>* %1 to %struct.S1* + call void @foo3(%struct.S1* %tmpcast) #2 + %p2 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 0 + %2 = load double* %p2, align 16 + %p3 = getelementptr inbounds %struct.S1* %tmpcast, i64 0, i32 1 + %3 = load double* %p3, align 8 + %4 = insertelement <2 x double> undef, double %2, i32 0 + %5 = insertelement <2 x double> %4, double 0.000000e+00, i32 1 + %6 = insertelement <2 x double> undef, double %3, i32 1 + %7 = insertelement <2 x double> %6, double 1.000000e+00, i32 0 + %8 = fadd <2 x double> %5, %7 + store <2 x double> %8, <2 x double>* bitcast (%struct.S1* @g to <2 x double>*), align 16 + ret void +} diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll index 6eb97c3cd7ab..12a3adfdfe98 100644 --- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll +++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s ; rdar://5571034 ; This requires physreg joining, %vreg13 is live everywhere: diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 7bf8a618fa77..8937d6afa0ae 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -1,32 +1,96 @@ -; RUN: llc < %s -march=x86 -mattr=sse4.1 -mcpu=nehalem -stack-alignment=16 > %t -; RUN: grep pmul %t | count 12 -; RUN: grep mov %t | count 14 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41 define <4 x i32> @a(<4 x i32> %i) nounwind { - %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 > - ret <4 x i32> %A +; SSE2-LABEL: a: +; SSE2: movdqa {{.*}}, %[[X1:xmm[0-9]+]] +; SSE2-NEXT: pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %[[X1]], %xmm0 +; SSE2-NEXT: pmuludq %[[X1]], %[[X2]] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSE41-LABEL: a: +; SSE41: pmulld +; SSE41-NEXT: retq +entry: + %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 > + ret <4 x i32> %A } + define <2 x i64> @b(<2 x i64> %i) nounwind { - %A = mul <2 x i64> %i, < i64 117, i64 117 > - ret <2 x i64> %A +; ALL-LABEL: b: +; ALL: pmuludq +; ALL: pmuludq +; ALL: pmuludq +entry: + %A = mul <2 x i64> %i, < i64 117, i64 117 > + ret <2 x i64> %A } + define <4 x i32> @c(<4 x i32> %i, <4 x i32> %j) nounwind { - %A = mul <4 x i32> %i, %j - ret <4 x i32> %A +; SSE2-LABEL: c: +; SSE2: pshufd {{.*}} # [[X2:xmm[0-9]+]] = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %[[X2]], %xmm1 +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],xmm1[0,2] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSE41-LABEL: c: +; SSE41: pmulld +; SSE41-NEXT: retq +entry: + %A = mul <4 x i32> %i, %j + ret <4 x i32> %A } + define <2 x i64> @d(<2 x 
i64> %i, <2 x i64> %j) nounwind { - %A = mul <2 x i64> %i, %j - ret <2 x i64> %A +; ALL-LABEL: d: +; ALL: pmuludq +; ALL: pmuludq +; ALL: pmuludq +entry: + %A = mul <2 x i64> %i, %j + ret <2 x i64> %A } -; Use a call to force spills. + declare void @foo() + define <4 x i32> @e(<4 x i32> %i, <4 x i32> %j) nounwind { - call void @foo() - %A = mul <4 x i32> %i, %j - ret <4 x i32> %A +; SSE2-LABEL: e: +; SSE2: movdqa {{[0-9]*}}(%rsp), %xmm0 +; SSE2-NEXT: pshufd {{.*}} # [[X1:xmm[0-9]+]] = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa {{[0-9]*}}(%rsp), %[[X2:xmm[0-9]+]] +; SSE2-NEXT: pmuludq %[[X2]], %xmm0 +; SSE2-NEXT: pshufd {{.*}} # [[X2]] = [[X2]][1,1,3,3] +; SSE2-NEXT: pmuludq %[[X1]], %[[X2]] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],[[X2]][0,2] +; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: addq ${{[0-9]+}}, %rsp +; SSE2-NEXT: retq +; +; SSE41-LABEL: e: +; SSE41: pmulld {{[0-9]+}}(%rsp), %xmm +; SSE41-NEXT: addq ${{[0-9]+}}, %rsp +; SSE41-NEXT: retq +entry: + ; Use a call to force spills. + call void @foo() + %A = mul <4 x i32> %i, %j + ret <4 x i32> %A } + define <2 x i64> @f(<2 x i64> %i, <2 x i64> %j) nounwind { - call void @foo() - %A = mul <2 x i64> %i, %j - ret <2 x i64> %A +; ALL-LABEL: f: +; ALL: pmuludq +; ALL: pmuludq +; ALL: pmuludq +entry: + ; Use a call to force spills. + call void @foo() + %A = mul <2 x i64> %i, %j + ret <2 x i64> %A } diff --git a/test/CodeGen/X86/pr11334.ll b/test/CodeGen/X86/pr11334.ll index e7e29e0d609c..0bdb0ec7cf44 100644 --- a/test/CodeGen/X86/pr11334.ll +++ b/test/CodeGen/X86/pr11334.ll @@ -15,7 +15,7 @@ define <3 x double> @v3f2d_ext_vec(<3 x float> %v1) nounwind { entry: ; CHECK: v3f2d_ext_vec ; CHECK: cvtps2pd -; CHECK: movhlps +; CHECK: shufpd ; CHECK: cvtps2pd ; AVX: v3f2d_ext_vec ; AVX: vcvtps2pd @@ -28,7 +28,7 @@ define <4 x double> @v4f2d_ext_vec(<4 x float> %v1) nounwind { entry: ; CHECK: v4f2d_ext_vec ; CHECK: cvtps2pd -; CHECK: movhlps +; CHECK: shufpd ; CHECK: cvtps2pd ; AVX: v4f2d_ext_vec ; AVX: vcvtps2pd @@ -42,9 +42,9 @@ entry: ; CHECK: v8f2d_ext_vec ; CHECK: cvtps2pd ; CHECK: cvtps2pd -; CHECK: movhlps +; CHECK: shufpd ; CHECK: cvtps2pd -; CHECK: movhlps +; CHECK: shufpd ; CHECK: cvtps2pd ; AVX: v8f2d_ext_vec ; AVX: vcvtps2pd diff --git a/test/CodeGen/X86/pr11468.ll b/test/CodeGen/X86/pr11468.ll index f7e9adb4a211..f721df11586b 100644 --- a/test/CodeGen/X86/pr11468.ll +++ b/test/CodeGen/X86/pr11468.ll @@ -29,5 +29,5 @@ entry: ; CHECK: popq %rbp } -!0 = metadata !{i32 125} +!0 = !{i32 125} diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll deleted file mode 100644 index 024b163fa718..000000000000 --- a/test/CodeGen/X86/pr12359.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s -define <16 x i8> @shuf(<16 x i8> %inval1) { -entry: - %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4, i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4> - ret <16 x i8> %0 -; CHECK: shuf -; CHECK: # BB#0: # %entry -; CHECK-NEXT: pshufb -; CHECK-NEXT: ret -} diff --git a/test/CodeGen/X86/pr12360.ll b/test/CodeGen/X86/pr12360.ll index 8b30596cd8ac..673403624589 100644 --- a/test/CodeGen/X86/pr12360.ll +++ b/test/CodeGen/X86/pr12360.ll @@ -22,7 +22,7 @@ entry: ret i1 %tobool } -!0 = metadata !{i8 0, i8 2} +!0 = !{i8 0, i8 2} ; check that we don't build a "trunc" from i1 to i1, which would assert. 
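; (The !range metadata above already restricts the loaded byte to 0 or 1,
; which is the situation that could tempt codegen into the no-op trunc.)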
diff --git a/test/CodeGen/X86/pr14161.ll b/test/CodeGen/X86/pr14161.ll index ff4532eac3ac..c2bb8d3df8f3 100644 --- a/test/CodeGen/X86/pr14161.ll +++ b/test/CodeGen/X86/pr14161.ll @@ -3,6 +3,12 @@ declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) define <2 x i16> @good(<4 x i32>*, <4 x i8>*) { +; CHECK-LABEL: good: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movdqa (%rdi), %xmm0 +; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pmovzxwq %xmm0, %xmm0 +; CHECK-NEXT: retq entry: %2 = load <4 x i32>* %0, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>) @@ -13,13 +19,17 @@ entry: %8 = bitcast i32 %4 to <2 x i16> %9 = bitcast i32 %5 to <2 x i16> ret <2 x i16> %8 -; CHECK: good -; CHECK: pminud -; CHECK-NEXT: pmovzxwq -; CHECK: ret } define <2 x i16> @bad(<4 x i32>*, <4 x i8>*) { +; CHECK-LABEL: bad: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movdqa (%rdi), %xmm0 +; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0 +; CHECK-NEXT: pextrd $1, %xmm0, %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: pmovzxwq %xmm0, %xmm0 +; CHECK-NEXT: retq entry: %2 = load <4 x i32>* %0, align 16 %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>) @@ -30,9 +40,4 @@ entry: %8 = bitcast i32 %4 to <2 x i16> %9 = bitcast i32 %5 to <2 x i16> ret <2 x i16> %9 -; CHECK: bad -; CHECK: pminud -; CHECK: pextrd -; CHECK: pmovzxwq -; CHECK: ret } diff --git a/test/CodeGen/X86/pr15267.ll b/test/CodeGen/X86/pr15267.ll index c8aaf327a7dd..b4dc5fd47168 100644 --- a/test/CodeGen/X86/pr15267.ll +++ b/test/CodeGen/X86/pr15267.ll @@ -48,19 +48,22 @@ define <4 x i64> @test3(<4 x i1>* %in) nounwind { ; CHECK: test3 ; CHECK: movzbl -; CHECK: shrl -; CHECK: andl $1 -; CHECK: andl $1 -; CHECK: vmovd -; CHECK: pinsrd $1 -; CHECK: shrl $2 -; CHECK: andl $1 -; CHECK: pinsrd $2 -; CHECK: shrl $3 -; CHECK: andl $1 -; CHECK: pinsrd $3 -; CHECK: pslld -; CHECK: psrad -; CHECK: pmovsxdq -; CHECK: pmovsxdq +; CHECK: movq +; CHECK: shlq +; CHECK: sarq +; CHECK: vmovq +; CHECK: movq +; CHECK: shlq +; CHECK: sarq +; CHECK: vmovq +; CHECK: vpunpcklqdq +; CHECK: movq +; CHECK: shlq +; CHECK: sarq +; CHECK: vmovq +; CHECK: shlq +; CHECK: sarq +; CHECK: vmovq +; CHECK: vpunpcklqdq +; CHECK: vinsertf128 ; CHECK: ret diff --git a/test/CodeGen/X86/pr18846.ll b/test/CodeGen/X86/pr18846.ll new file mode 100644 index 000000000000..c65bc79d6813 --- /dev/null +++ b/test/CodeGen/X86/pr18846.ll @@ -0,0 +1,139 @@ +; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; pr18846 - needless avx spill/reload +; Test for unnecessary repeated spills due to eliminateRedundantSpills failing +; to recognise unaligned ymm load/stores to the stack. +; Bugpoint reduced testcase. 
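+; (Unaligned 32-byte spills and reloads are emitted as vmovups rather than
+; vmovaps, and the spill-cleanup logic previously recognized only the aligned
+; form, hence the CHECK-NOTs on folded vmovups below.)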
+ +;CHECK-LABEL: _Z16opt_kernel_cachePfS_S_ +;CHECK-NOT: vmovups {{.*#+}} 32-byte Folded Spill +;CHECK-NOT: vmovups {{.*#+}} 32-byte Folded Reload + +; Function Attrs: uwtable +define void @_Z16opt_kernel_cachePfS_S_() #0 { +entry: + br label %for.body29 + +for.body29: ; preds = %for.body29, %entry + br i1 undef, label %for.body29, label %for.body65 + +for.body65: ; preds = %for.body29 + %0 = load float* undef, align 4, !tbaa !1 + %vecinit7.i4448 = insertelement <8 x float> undef, float %0, i32 7 + %1 = load float* null, align 4, !tbaa !1 + %vecinit7.i4304 = insertelement <8 x float> undef, float %1, i32 7 + %2 = load float* undef, align 4, !tbaa !1 + %vecinit7.i4196 = insertelement <8 x float> undef, float %2, i32 7 + %3 = or i64 0, 16 + %add.ptr111.sum4096 = add i64 %3, 0 + %4 = load <8 x float>* null, align 16, !tbaa !5 + %add.ptr162 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr111.sum4096 + %__v.i4158 = bitcast float* %add.ptr162 to <8 x float>* + %5 = load <8 x float>* %__v.i4158, align 16, !tbaa !5 + %add.ptr158.sum40975066 = or i64 %add.ptr111.sum4096, 8 + %add.ptr183 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr158.sum40975066 + %__v.i4162 = bitcast float* %add.ptr183 to <8 x float>* + %6 = load <8 x float>* %__v.i4162, align 16, !tbaa !5 + %add.ptr200.sum40995067 = or i64 undef, 8 + %add.ptr225 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr200.sum40995067 + %__v.i4167 = bitcast float* %add.ptr225 to <8 x float>* + %7 = load <8 x float>* %__v.i4167, align 4, !tbaa !5 + %8 = load <8 x float>* undef, align 16, !tbaa !5 + %add.ptr242.sum41015068 = or i64 0, 8 + %add.ptr267 = getelementptr inbounds [65536 x float]* null, i64 0, i64 %add.ptr242.sum41015068 + %__v.i4171 = bitcast float* %add.ptr267 to <8 x float>* + %9 = load <8 x float>* %__v.i4171, align 4, !tbaa !5 + %mul.i4690 = fmul <8 x float> %7, undef + %add.i4665 = fadd <8 x float> undef, undef + %mul.i4616 = fmul <8 x float> %8, undef + %mul.i4598 = fmul <8 x float> undef, undef + %add.i4597 = fadd <8 x float> undef, %mul.i4598 + %mul.i4594 = fmul <8 x float> %6, undef + %add.i4593 = fadd <8 x float> undef, %mul.i4594 + %mul.i4578 = fmul <8 x float> %9, undef + %add.i4577 = fadd <8 x float> %add.i4593, %mul.i4578 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4577) #1 + %10 = load <8 x float>* null, align 16, !tbaa !5 + %11 = load <8 x float>* undef, align 16, !tbaa !5 + %mul.i4564 = fmul <8 x float> %4, undef + %add.i4563 = fadd <8 x float> %10, %mul.i4564 + %mul.i4560 = fmul <8 x float> %5, undef + %add.i4559 = fadd <8 x float> %11, %mul.i4560 + %add.i4547 = fadd <8 x float> %add.i4563, undef + %mul.i4546 = fmul <8 x float> %7, undef + %add.i4545 = fadd <8 x float> undef, %mul.i4546 + %mul.i4544 = fmul <8 x float> %8, undef + %add.i4543 = fadd <8 x float> %add.i4559, %mul.i4544 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4547) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4545) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4543) #1 + %add.i4455 = fadd <8 x float> undef, undef + %mul.i4454 = fmul <8 x float> undef, undef + %add.i4453 = fadd <8 x float> undef, %mul.i4454 + %mul.i4440 = fmul <8 x float> zeroinitializer, %vecinit7.i4448 + %add.i4439 = fadd <8 x float> %add.i4455, %mul.i4440 + %mul.i4438 = fmul <8 x float> %7, %vecinit7.i4448 + %add.i4437 = fadd <8 x float> %add.i4453, %mul.i4438 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4439) 
#1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4437) #1 + %add.i4413 = fadd <8 x float> zeroinitializer, undef + %mul.i4400 = fmul <8 x float> %8, undef + %add.i4399 = fadd <8 x float> undef, %mul.i4400 + %add.i4397 = fadd <8 x float> %add.i4413, zeroinitializer + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> zeroinitializer) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4399) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4397) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1 + %mul.i4330 = fmul <8 x float> %7, undef + %add.i4329 = fadd <8 x float> undef, %mul.i4330 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4329) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> undef) #1 + %mul.i4312 = fmul <8 x float> %4, undef + %add.i4311 = fadd <8 x float> undef, %mul.i4312 + %mul.i4306 = fmul <8 x float> %6, undef + %add.i4305 = fadd <8 x float> undef, %mul.i4306 + %add.i4295 = fadd <8 x float> %add.i4311, undef + %mul.i4294 = fmul <8 x float> %7, %vecinit7.i4304 + %add.i4293 = fadd <8 x float> undef, %mul.i4294 + %mul.i4292 = fmul <8 x float> %8, %vecinit7.i4304 + %add.i4291 = fadd <8 x float> undef, %mul.i4292 + %mul.i4290 = fmul <8 x float> %9, %vecinit7.i4304 + %add.i4289 = fadd <8 x float> %add.i4305, %mul.i4290 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4295) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4293) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4291) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4289) #1 + %12 = load <8 x float>* undef, align 16, !tbaa !5 + %mul.i4274 = fmul <8 x float> undef, undef + %add.i4273 = fadd <8 x float> %12, %mul.i4274 + %mul.i4258 = fmul <8 x float> %7, undef + %add.i4257 = fadd <8 x float> %add.i4273, %mul.i4258 + %mul.i4254 = fmul <8 x float> %9, undef + %add.i4253 = fadd <8 x float> undef, %mul.i4254 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4257) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i4253) #1 + %mul.i = fmul <8 x float> %9, %vecinit7.i4196 + %add.i = fadd <8 x float> undef, %mul.i + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> zeroinitializer) #1 + call void @llvm.x86.avx.storeu.ps.256(i8* undef, <8 x float> %add.i) #1 + unreachable +} + +; Function Attrs: nounwind +declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) #1 + +attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.5 "} +!1 = !{!2, !2, i64 0} +!2 = !{!"float", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} +!5 = !{!3, !3, i64 0} diff --git a/test/CodeGen/X86/pr21099.ll b/test/CodeGen/X86/pr21099.ll new file mode 100644 index 000000000000..07292c125eea --- /dev/null +++ b/test/CodeGen/X86/pr21099.ll @@ -0,0 +1,10 @@ +; RUN: llc < %s -O2 -march=x86-64 -verify-machineinstrs | FileCheck %s + +define void @pr21099(i64* %p) { +; CHECK-LABEL: pr21099 +; CHECK: lock +; CHECK-NEXT: addq $-2147483648 +; This number is INT32_MIN: 0x80000000UL + %1 = atomicrmw add i64* %p, i64 -2147483648 seq_cst + ret void 
+} diff --git a/test/CodeGen/X86/pr21529.ll b/test/CodeGen/X86/pr21529.ll new file mode 100644 index 000000000000..655bc844f503 --- /dev/null +++ b/test/CodeGen/X86/pr21529.ll @@ -0,0 +1,15 @@ +; RUN: llc -show-mc-encoding < %s | FileCheck %s + +; Test that the direct object emission selects the and variant with 8 bit +; immediate. +; We used to get this wrong when using direct object emission, but not when +; reading assembly. + +; CHECK: andq $-32, %rsp # encoding: [0x48,0x83,0xe4,0xe0] + +target triple = "x86_64-pc-linux" + +define void @f() { + %foo = alloca i8, align 32 + ret void +} diff --git a/test/CodeGen/X86/pr22019.ll b/test/CodeGen/X86/pr22019.ll new file mode 100644 index 000000000000..4cee5d704d3a --- /dev/null +++ b/test/CodeGen/X86/pr22019.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +module asm "pselect = __pselect" +module asm "var = __var" +module asm "alias = __alias" +; CHECK: pselect = __pselect +; CHECK: var = __var +; CHECK: alias = __alias + +; CHECK: pselect: +; CHECK: retq +define void @pselect() { + ret void +} + +; CHECK: var: +; CHECK: .long 0 +@var = global i32 0 + +; CHECK: alias = var +@alias = alias i32* @var diff --git a/test/CodeGen/X86/pr22103.ll b/test/CodeGen/X86/pr22103.ll new file mode 100644 index 000000000000..77c0751e219d --- /dev/null +++ b/test/CodeGen/X86/pr22103.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s | FileCheck %s +; Don't try to emit a direct call through a TLS global. +; This fixes PR22103 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = external thread_local global i64 + +; Function Attrs: nounwind +define void @_Z1fv() { +; CHECK-NOT: callq *$a +; CHECK: movq %fs:0, [[RAX:%r..]] +; CHECK-NEXT: addq a@GOTTPOFF(%rip), [[RAX]] +; CHECK-NEXT: callq *[[RAX]] +entry: + call void bitcast (i64* @a to void ()*)() + ret void +} diff --git a/test/CodeGen/X86/pre-ra-sched.ll b/test/CodeGen/X86/pre-ra-sched.ll index 70135d43f49b..bb4c1269b7cf 100644 --- a/test/CodeGen/X86/pre-ra-sched.ll +++ b/test/CodeGen/X86/pre-ra-sched.ll @@ -1,4 +1,4 @@ -; RUN-disabled: llc < %s -mtriple=x86_64-apple-macosx -pre-RA-sched=ilp -debug-only=pre-RA-sched \ +; RUN-disabled: llc < %s -verify-machineinstrs -mtriple=x86_64-apple-macosx -pre-RA-sched=ilp -debug-only=pre-RA-sched \ ; RUN-disabled: 2>&1 | FileCheck %s ; RUN: true ; REQUIRES: asserts diff --git a/test/CodeGen/X86/prefixdata.ll b/test/CodeGen/X86/prefixdata.ll index 2ec1892dd183..9bb54a2a3977 100644 --- a/test/CodeGen/X86/prefixdata.ll +++ b/test/CodeGen/X86/prefixdata.ll @@ -2,16 +2,17 @@ @i = linkonce_odr global i32 1 -; CHECK: f: -; CHECK-NEXT: .cfi_startproc +; CHECK: .type f,@function ; CHECK-NEXT: .long 1 +; CHECK-NEXT: # 0x1 +; CHECK-NEXT: f: define void @f() prefix i32 1 { ret void } -; CHECK: g: -; CHECK-NEXT: .cfi_startproc +; CHECK: .type g,@function ; CHECK-NEXT: .quad i +; CHECK-NEXT: g: define void @g() prefix i32* @i { ret void } diff --git a/test/CodeGen/X86/prologuedata.ll b/test/CodeGen/X86/prologuedata.ll new file mode 100644 index 000000000000..6a50ddbfd140 --- /dev/null +++ b/test/CodeGen/X86/prologuedata.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +@i = linkonce_odr global i32 1 + +; CHECK: f: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .long 1 +define void @f() prologue i32 1 { + ret void +} + +; CHECK: g: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .quad i +define 
void @g() prologue i32* @i { + ret void +} diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll new file mode 100644 index 000000000000..303c4a684761 --- /dev/null +++ b/test/CodeGen/X86/pshufb-mask-comments.ll @@ -0,0 +1,40 @@ +; RUN: llc < %s -march=x86-64 -mattr=+ssse3 | FileCheck %s + +; Test that the pshufb mask comment is correct. + +define <16 x i8> @test1(<16 x i8> %V) { +; CHECK-LABEL: test1: +; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,0,0,2,0,0,0,0,3,0,0,0,0,4] + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 2, i8 0, i8 0, i8 0, i8 0, i8 3, i8 0, i8 0, i8 0, i8 0, i8 4>) + ret <16 x i8> %1 +} + +; Test that indexes larger than the size of the vector are shown masked (bottom 4 bits). + +define <16 x i8> @test2(<16 x i8> %V) { +; CHECK-LABEL: test2: +; CHECK: pshufb {{.*}}# xmm0 = xmm0[15,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2] + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 15, i8 0, i8 0, i8 0, i8 0, i8 16, i8 0, i8 0, i8 0, i8 0, i8 17, i8 0, i8 0, i8 0, i8 0, i8 50>) + ret <16 x i8> %1 +} + +; Test that indexes with bit seven set are shown as zero. + +define <16 x i8> @test3(<16 x i8> %V) { +; CHECK-LABEL: test3: +; CHECK: pshufb {{.*}}# xmm0 = xmm0[1,0,0,15,0,2,0,0],zero,xmm0[0,3,0,0],zero,xmm0[0,4] + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 127, i8 0, i8 2, i8 0, i8 0, i8 128, i8 0, i8 3, i8 0, i8 0, i8 255, i8 0, i8 4>) + ret <16 x i8> %1 +} + +; Test that we won't crash when the constant was reused for another instruction. + +define <16 x i8> @test4(<2 x i64>* %V) { +; CHECK-LABEL: test4 +; CHECK: pshufb {{.*}} + store <2 x i64> <i64 1084818905618843912, i64 506097522914230528>, <2 x i64>* %V, align 16 + %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> undef, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>) + ret <16 x i8> %1 +} + +declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone diff --git a/test/CodeGen/X86/ragreedy-bug.ll b/test/CodeGen/X86/ragreedy-bug.ll index df9b41d6e90b..83ac274bba19 100644 --- a/test/CodeGen/X86/ragreedy-bug.ll +++ b/test/CodeGen/X86/ragreedy-bug.ll @@ -266,27 +266,27 @@ return: %retval.0 = phi i32 [ 0, %entry ], [ 1, %land.lhs.true52 ], [ 1, %land.lhs.true43 ], [ 0, %if.else123 ], [ 1, %while.cond59.preheader ], [ 1, %while.cond95.preheader ], [ 1, %while.cond130.preheader ], [ 1, %land.lhs.true28 ], [ 1, %if.then83 ], [ 0, %lor.lhs.false74 ], [ 1, %land.rhs ], [ 1, %if.then117 ], [ 0, %while.body104 ], [ 1, %land.rhs99 ], [ 1, %if.then152 ], [ 0, %while.body139 ], [ 1, %land.rhs134 ], [ 0, %while.body ] ret i32 %retval.0 } -!181 = metadata !{metadata !"branch_weights", i32 662038, i32 1} -!988 = metadata !{metadata !"branch_weights", i32 12091450, i32 1916} -!989 = metadata !{metadata !"branch_weights", i32 7564670, i32 4526781} -!990 = metadata !{metadata !"branch_weights", i32 7484958, i32 13283499} -!991 = metadata !{metadata !"branch_weights", i32 8677007, i32 4606493} -!992 = metadata !{metadata !"branch_weights", i32 -1172426948, i32 145094705} -!993 = metadata !{metadata !"branch_weights", i32 1468914, i32 5683688} -!994 = metadata !{metadata !"branch_weights", i32 114025221, i32 -1217548794, i32 -1199521551, i32 87712616} -!995 = metadata !{metadata !"branch_weights", i32 1853716452, i32 -444717951, i32 932776759} -!996 = metadata 
!{metadata !"branch_weights", i32 1004870, i32 20259} -!997 = metadata !{metadata !"branch_weights", i32 20071, i32 189} -!998 = metadata !{metadata !"branch_weights", i32 -1020255939, i32 572177766} -!999 = metadata !{metadata !"branch_weights", i32 2666513, i32 3466431} -!1000 = metadata !{metadata !"branch_weights", i32 5117635, i32 1859780} -!1001 = metadata !{metadata !"branch_weights", i32 354902465, i32 -1444604407} -!1002 = metadata !{metadata !"branch_weights", i32 -1762419279, i32 1592770684} -!1003 = metadata !{metadata !"branch_weights", i32 1435905930, i32 -1951930624} -!1004 = metadata !{metadata !"branch_weights", i32 1, i32 504888} -!1005 = metadata !{metadata !"branch_weights", i32 94662, i32 504888} -!1006 = metadata !{metadata !"branch_weights", i32 -1897793104, i32 160196332} -!1007 = metadata !{metadata !"branch_weights", i32 2074643678, i32 -29579071} -!1008 = metadata !{metadata !"branch_weights", i32 1, i32 226163} -!1009 = metadata !{metadata !"branch_weights", i32 58357, i32 226163} -!1010 = metadata !{metadata !"branch_weights", i32 -2072848646, i32 92907517} +!181 = !{!"branch_weights", i32 662038, i32 1} +!988 = !{!"branch_weights", i32 12091450, i32 1916} +!989 = !{!"branch_weights", i32 7564670, i32 4526781} +!990 = !{!"branch_weights", i32 7484958, i32 13283499} +!991 = !{!"branch_weights", i32 8677007, i32 4606493} +!992 = !{!"branch_weights", i32 -1172426948, i32 145094705} +!993 = !{!"branch_weights", i32 1468914, i32 5683688} +!994 = !{!"branch_weights", i32 114025221, i32 -1217548794, i32 -1199521551, i32 87712616} +!995 = !{!"branch_weights", i32 1853716452, i32 -444717951, i32 932776759} +!996 = !{!"branch_weights", i32 1004870, i32 20259} +!997 = !{!"branch_weights", i32 20071, i32 189} +!998 = !{!"branch_weights", i32 -1020255939, i32 572177766} +!999 = !{!"branch_weights", i32 2666513, i32 3466431} +!1000 = !{!"branch_weights", i32 5117635, i32 1859780} +!1001 = !{!"branch_weights", i32 354902465, i32 -1444604407} +!1002 = !{!"branch_weights", i32 -1762419279, i32 1592770684} +!1003 = !{!"branch_weights", i32 1435905930, i32 -1951930624} +!1004 = !{!"branch_weights", i32 1, i32 504888} +!1005 = !{!"branch_weights", i32 94662, i32 504888} +!1006 = !{!"branch_weights", i32 -1897793104, i32 160196332} +!1007 = !{!"branch_weights", i32 2074643678, i32 -29579071} +!1008 = !{!"branch_weights", i32 1, i32 226163} +!1009 = !{!"branch_weights", i32 58357, i32 226163} +!1010 = !{!"branch_weights", i32 -2072848646, i32 92907517} diff --git a/test/CodeGen/X86/ragreedy-hoist-spill.ll b/test/CodeGen/X86/ragreedy-hoist-spill.ll index c6b28f71af46..57afb4152db5 100644 --- a/test/CodeGen/X86/ragreedy-hoist-spill.ll +++ b/test/CodeGen/X86/ragreedy-hoist-spill.ll @@ -202,7 +202,6 @@ lor.rhs500: ; CHECK: lor.rhs500 ; Make sure that we don't hoist the spill to outer loops. 
; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp) - ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp) ; CHECK: callq {{.*}}maskrune %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256) br i1 undef, label %land.lhs.true504, label %do.body479.backedge @@ -378,12 +377,12 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) !llvm.ident = !{!0} -!0 = metadata !{metadata !"clang version 3.5.0 (trunk 204257)"} -!1 = metadata !{metadata !2, metadata !2, i64 0} -!2 = metadata !{metadata !"int", metadata !3, i64 0} -!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0} -!4 = metadata !{metadata !"Simple C/C++ TBAA"} -!5 = metadata !{metadata !3, metadata !3, i64 0} -!6 = metadata !{metadata !7, metadata !8, i64 8} -!7 = metadata !{metadata !"", metadata !8, i64 0, metadata !8, i64 8, metadata !3, i64 16} -!8 = metadata !{metadata !"any pointer", metadata !3, i64 0} +!0 = !{!"clang version 3.5.0 (trunk 204257)"} +!1 = !{!2, !2, i64 0} +!2 = !{!"int", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} +!5 = !{!3, !3, i64 0} +!6 = !{!7, !8, i64 8} +!7 = !{!"", !8, i64 0, !8, i64 8, !3, i64 16} +!8 = !{!"any pointer", !3, i64 0} diff --git a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll index d8e45727b9d2..49d58f437c21 100644 --- a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll +++ b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll @@ -2,10 +2,12 @@ ; Without the last chance recoloring, this test fails with: ; "ran out of registers". -; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-depth=0 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEPTH +; NOTE: With the fix to PR18883, we don't actually run out of registers here +; any more, and so those checks are disabled. This test remains only for general coverage. +; XXX: not llc -regalloc=greedy -relocation-model=pic -lcr-max-depth=0 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEPTH ; Test whether failure due to cutoff for depth is reported -; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-INTERF +; XXX: not llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-INTERF ; Test whether failure due to cutoff for interference is reported ; RUN: llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 -lcr-max-depth=0 -exhaustive-register-search < %s > %t 2>&1 diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll new file mode 100644 index 000000000000..83b86accdb38 --- /dev/null +++ b/test/CodeGen/X86/recip-fastmath.ll @@ -0,0 +1,109 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+use-recip-est,+avx -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE + +; If the target's divss/divps instructions are substantially +; slower than rcpss/rcpps with a Newton-Raphson refinement, +; we should generate the estimate sequence. + +; See PR21385 ( http://llvm.org/bugs/show_bug.cgi?id=21385 ) +; for details about the accuracy, speed, and implementation +; differences of x86 reciprocal estimates. 
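+;
+; For reference (a sketch added for exposition, not part of the generated
+; checks): given an initial estimate E0 ~= 1/x from rcpss/rcpps, one
+; Newton-Raphson step computes
+;   E1 = E0 + E0 * (1 - x * E0)
+; which roughly doubles the number of correct bits. This is the
+; mul/sub/mul/add pattern expected for BTVER2 below; the REFINE run asks
+; for two such steps via -x86-recip-refinement-steps=2, hence the pattern
+; repeats.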
+
+define float @reciprocal_estimate(float %x) #0 {
+  %div = fdiv fast float 1.0, %x
+  ret float %div
+
+; CHECK-LABEL: reciprocal_estimate:
+; CHECK: movss
+; CHECK-NEXT: divss
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate:
+; BTVER2: vrcpss
+; BTVER2: vmulss
+; BTVER2: vsubss
+; BTVER2: vmulss
+; BTVER2: vaddss
+; BTVER2-NEXT: retq
+
+; REFINE-LABEL: reciprocal_estimate:
+; REFINE: vrcpss
+; REFINE: vmulss
+; REFINE: vsubss
+; REFINE: vmulss
+; REFINE: vaddss
+; REFINE: vmulss
+; REFINE: vsubss
+; REFINE: vmulss
+; REFINE: vaddss
+; REFINE-NEXT: retq
+}
+
+define <4 x float> @reciprocal_estimate_v4f32(<4 x float> %x) #0 {
+  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <4 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v4f32:
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v4f32:
+; BTVER2: vrcpps
+; BTVER2: vmulps
+; BTVER2: vsubps
+; BTVER2: vmulps
+; BTVER2: vaddps
+; BTVER2-NEXT: retq
+
+; REFINE-LABEL: reciprocal_estimate_v4f32:
+; REFINE: vrcpps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE-NEXT: retq
+}
+
+define <8 x float> @reciprocal_estimate_v8f32(<8 x float> %x) #0 {
+  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
+  ret <8 x float> %div
+
+; CHECK-LABEL: reciprocal_estimate_v8f32:
+; CHECK: movaps
+; CHECK: movaps
+; CHECK-NEXT: divps
+; CHECK-NEXT: divps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: movaps
+; CHECK-NEXT: retq
+
+; BTVER2-LABEL: reciprocal_estimate_v8f32:
+; BTVER2: vrcpps
+; BTVER2: vmulps
+; BTVER2: vsubps
+; BTVER2: vmulps
+; BTVER2: vaddps
+; BTVER2-NEXT: retq
+
+; REFINE-LABEL: reciprocal_estimate_v8f32:
+; REFINE: vrcpps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE: vmulps
+; REFINE: vsubps
+; REFINE: vmulps
+; REFINE: vaddps
+; REFINE-NEXT: retq
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
new file mode 100644
index 000000000000..00679428ca63
--- /dev/null
+++ b/test/CodeGen/X86/regalloc-reconcile-broken-hints.ll
@@ -0,0 +1,145 @@
+; RUN: llc < %s -o - -mtriple=x86_64-apple-macosx | FileCheck %s
+; Test case for the recoloring of broken hints.
+; It is tricky to produce something reasonably small that triggers this
+; optimization, since it requires that splitting and spilling occur.
+; The bottom line is that this test case is fragile.
+; This was reduced from the make_list function from the llvm-testsuite:
+; SingleSource/Benchmarks/McGill/chomp.c
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+%struct._list = type { i32*, %struct._list* }
+
+@ncol = external global i32, align 4
+@nrow = external global i32, align 4
+
+declare noalias i32* @copy_data()
+
+declare noalias i8* @malloc(i64)
+
+declare i32 @get_value()
+
+declare i32 @in_wanted(i32* nocapture readonly)
+
+declare noalias i32* @make_data()
+
+; CHECK-LABEL: make_list:
+; Function prologue.
+; CHECK: pushq
+; CHECK: subq ${{[0-9]+}}, %rsp
+; Move the first argument (%data) into a temporary register.
+; It will not survive the call to malloc otherwise.
+; CHECK: movq %rdi, [[ARG1:%r[0-9a-z]+]]
+; CHECK: callq _malloc
+; Compute %data - 1 as used for the load in land.rhs.i (via the variable %indvars.iv.next.i).
+; CHECK: addq $-4, [[ARG1]]
+; We used to produce a useless copy here and move %data into another temporary register.
+; CHECK-NOT: movq [[ARG1]]
+; End of the first basic block.
+; CHECK: .align
+; Now check that %data is used in an address computation.
+; CHECK: leaq ([[ARG1]]
+define %struct._list* @make_list(i32* nocapture readonly %data, i32* nocapture %value, i32* nocapture %all) {
+entry:
+  %call = tail call i8* @malloc(i64 16)
+  %next = getelementptr inbounds i8* %call, i64 8
+  %tmp = bitcast i8* %next to %struct._list**
+  %tmp2 = bitcast i8* %call to %struct._list*
+  %.pre78 = load i32* @ncol, align 4
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc32, %entry
+  %tmp4 = phi i32 [ %.pre78, %entry ], [ 0, %for.inc32 ]
+  %current.077 = phi %struct._list* [ %tmp2, %entry ], [ %current.1.lcssa, %for.inc32 ]
+  %cmp270 = icmp eq i32 %tmp4, 0
+  br i1 %cmp270, label %for.inc32, label %for.body3
+
+for.body3:                                        ; preds = %if.end31, %for.cond1.preheader
+  %current.173 = phi %struct._list* [ %current.2, %if.end31 ], [ %current.077, %for.cond1.preheader ]
+  %row.172 = phi i32 [ %row.3, %if.end31 ], [ 0, %for.cond1.preheader ]
+  %col.071 = phi i32 [ %inc, %if.end31 ], [ 0, %for.cond1.preheader ]
+  %call4 = tail call i32* @make_data()
+  %tmp5 = load i32* @ncol, align 4
+  %tobool14.i = icmp eq i32 %tmp5, 0
+  br i1 %tobool14.i, label %while.cond.i, label %while.body.lr.ph.i
+
+while.body.lr.ph.i:                               ; preds = %for.body3
+  %tmp6 = sext i32 %tmp5 to i64
+  br label %while.body.i
+
+while.body.i:                                     ; preds = %while.body.i, %while.body.lr.ph.i
+  %indvars.iv.i = phi i64 [ %tmp6, %while.body.lr.ph.i ], [ %indvars.iv.next.i, %while.body.i ]
+  %indvars.iv.next.i = add nsw i64 %indvars.iv.i, -1
+  %tmp9 = trunc i64 %indvars.iv.next.i to i32
+  %tobool.i = icmp eq i32 %tmp9, 0
+  br i1 %tobool.i, label %while.cond.i, label %while.body.i
+
+while.cond.i:                                     ; preds = %land.rhs.i, %while.body.i, %for.body3
+  %indvars.iv.i64 = phi i64 [ %indvars.iv.next.i65, %land.rhs.i ], [ 0, %for.body3 ], [ %tmp6, %while.body.i ]
+  %indvars.iv.next.i65 = add nsw i64 %indvars.iv.i64, -1
+  %tmp10 = trunc i64 %indvars.iv.i64 to i32
+  %tobool.i66 = icmp eq i32 %tmp10, 0
+  br i1 %tobool.i66, label %if.else, label %land.rhs.i
+
+land.rhs.i:                                       ; preds = %while.cond.i
+  %arrayidx.i67 = getelementptr inbounds i32* %call4, i64 %indvars.iv.next.i65
+  %tmp11 = load i32* %arrayidx.i67, align 4
+  %arrayidx2.i68 = getelementptr inbounds i32* %data, i64 %indvars.iv.next.i65
+  %tmp12 = load i32* %arrayidx2.i68, align 4
+  %cmp.i69 = icmp eq i32 %tmp11, %tmp12
+  br i1 %cmp.i69, label %while.cond.i, label %equal_data.exit
+
+equal_data.exit:                                  ; preds = %land.rhs.i
+  %cmp3.i = icmp slt i32 %tmp10, 1
+  br i1 %cmp3.i, label %if.else, label %if.then
+
+if.then:                                          ; preds = %equal_data.exit
+  %next7 = getelementptr inbounds %struct._list* %current.173, i64 0, i32 1
+  %tmp14 = load %struct._list** %next7, align 8
+  %next12 = getelementptr inbounds %struct._list* %tmp14, i64 0, i32 1
+  store %struct._list* null, %struct._list** %next12, align 8
+  %tmp15 = load %struct._list** %next7, align 8
+  %tmp16 = load i32* %value, align 4
+  %cmp14 = icmp eq i32 %tmp16, 1
+  %.tmp16 = select i1 %cmp14, i32 0, i32 %tmp16
+  %tmp18 = load i32* %all, align 4
+  %tmp19 = or i32 %tmp18, %.tmp16
+  %tmp20 = icmp eq i32 %tmp19, 0
+  br i1 %tmp20, label %if.then19, label %if.end31
+
+if.then19:                                        ; preds = %if.then
+ 
%call21 = tail call i32 @in_wanted(i32* %call4) + br label %if.end31 + +if.else: ; preds = %equal_data.exit, %while.cond.i + %cmp26 = icmp eq i32 %col.071, 0 + %.row.172 = select i1 %cmp26, i32 0, i32 %row.172 + %sub30 = add nsw i32 %tmp5, -1 + br label %if.end31 + +if.end31: ; preds = %if.else, %if.then19, %if.then + %col.1 = phi i32 [ %sub30, %if.else ], [ 0, %if.then ], [ 0, %if.then19 ] + %row.3 = phi i32 [ %.row.172, %if.else ], [ %row.172, %if.then ], [ 0, %if.then19 ] + %current.2 = phi %struct._list* [ %current.173, %if.else ], [ %tmp15, %if.then ], [ %tmp15, %if.then19 ] + %inc = add nsw i32 %col.1, 1 + %tmp25 = load i32* @ncol, align 4 + %cmp2 = icmp eq i32 %inc, %tmp25 + br i1 %cmp2, label %for.cond1.for.inc32_crit_edge, label %for.body3 + +for.cond1.for.inc32_crit_edge: ; preds = %if.end31 + %.pre79 = load i32* @nrow, align 4 + br label %for.inc32 + +for.inc32: ; preds = %for.cond1.for.inc32_crit_edge, %for.cond1.preheader + %tmp26 = phi i32 [ %.pre79, %for.cond1.for.inc32_crit_edge ], [ 0, %for.cond1.preheader ] + %current.1.lcssa = phi %struct._list* [ %current.2, %for.cond1.for.inc32_crit_edge ], [ %current.077, %for.cond1.preheader ] + %row.1.lcssa = phi i32 [ %row.3, %for.cond1.for.inc32_crit_edge ], [ 0, %for.cond1.preheader ] + %inc33 = add nsw i32 %row.1.lcssa, 1 + %cmp = icmp eq i32 %inc33, %tmp26 + br i1 %cmp, label %for.end34, label %for.cond1.preheader + +for.end34: ; preds = %for.inc32 + %.pre = load %struct._list** %tmp, align 8 + ret %struct._list* %.pre +} diff --git a/test/CodeGen/X86/remat-phys-dead.ll b/test/CodeGen/X86/remat-phys-dead.ll index 4d7ee622a37e..6cdcd28eacd8 100644 --- a/test/CodeGen/X86/remat-phys-dead.ll +++ b/test/CodeGen/X86/remat-phys-dead.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc -mtriple=x86_64-apple-darwin -debug -o /dev/null < %s 2>&1 | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=x86_64-apple-darwin -debug -o /dev/null < %s 2>&1 | FileCheck %s ; We need to make sure that rematerialization into a physical register marks the ; super- or sub-register as dead after this rematerialization since only the diff --git a/test/CodeGen/X86/return_zeroext_i2.ll b/test/CodeGen/X86/return_zeroext_i2.ll new file mode 100644 index 000000000000..d535b0c41267 --- /dev/null +++ b/test/CodeGen/X86/return_zeroext_i2.ll @@ -0,0 +1,7 @@ +; RUN: llc -mtriple=i386-pc-win32 < %s | FileCheck %s +; Check that the testcase does not crash +define zeroext i2 @crash () { + ret i2 0 +} +; CHECK: xorl %eax, %eax +; CHECK-NEXT: retl diff --git a/test/CodeGen/X86/scev-interchange.ll b/test/CodeGen/X86/scev-interchange.ll index 71a4d21a9b95..0e7047b4845d 100644 --- a/test/CodeGen/X86/scev-interchange.ll +++ b/test/CodeGen/X86/scev-interchange.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 +; RUN: llc < %s -mtriple=x86_64-linux target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" %"struct.DataOutBase::GmvFlags" = type { i32 } diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll index b82be41b8cbf..e34ba5412f07 100644 --- a/test/CodeGen/X86/segmented-stacks-dynamic.ll +++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll @@ -1,7 +1,9 @@ ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32 ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 
-verify-machineinstrs | FileCheck %s -check-prefix=X32ABI ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj ; Just to prevent the alloca from being optimized away declare void @dummy_use(i32*, i32) @@ -61,6 +63,26 @@ false: ; X64-NEXT: callq __morestack_allocate_stack_space ; X64: movq %rax, %rdi +; X32ABI-LABEL: test_basic: + +; X32ABI: cmpl %fs:64, %esp +; X32ABI-NEXT: ja .LBB0_2 + +; X32ABI: movl $24, %r10d +; X32ABI-NEXT: movl $0, %r11d +; X32ABI-NEXT: callq __morestack +; X32ABI-NEXT: ret + +; X32ABI: movl %esp, %[[EDI:edi|eax]] +; X32ABI: subl %{{.*}}, %[[EDI]] +; X32ABI-NEXT: cmpl %[[EDI]], %fs:64 + +; X32ABI: movl %[[EDI]], %esp + +; X32ABI: movl %{{.*}}, %edi +; X32ABI-NEXT: callq __morestack_allocate_stack_space +; X32ABI: movl %eax, %edi + } attributes #0 = { "split-stack" } diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll index 9dab3cd8d6d5..3e47121a380e 100644 --- a/test/CodeGen/X86/segmented-stacks.ll +++ b/test/CodeGen/X86/segmented-stacks.ll @@ -1,18 +1,25 @@ ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -code-model=large -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux-Large +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -verify-machineinstrs | FileCheck %s -check-prefix=X32ABI ; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X32-Darwin ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X64-Darwin ; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X32-MinGW ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -verify-machineinstrs | FileCheck %s -check-prefix=X64-FreeBSD +; RUN: llc < %s -mcpu=generic -mtriple=i686-dragonfly -verify-machineinstrs | FileCheck %s -check-prefix=X32-DFlyBSD +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-dragonfly -verify-machineinstrs | FileCheck %s -check-prefix=X64-DFlyBSD ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X64-MinGW ; We used to crash with filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -filetype=obj +; RUN: llc < %s -mcpu=generic -mtriple=i686-dragonfly -filetype=obj +; RUN: llc < %s -mcpu=generic -mtriple=x86_64-dragonfly -filetype=obj ; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -filetype=obj ; RUN: not llc < %s -mcpu=generic -mtriple=x86_64-solaris 2> %t.log @@ -51,6 +58,26 @@ define void @test_basic() #0 { ; X64-Linux-NEXT: callq __morestack ; X64-Linux-NEXT: ret +; X64-Linux-Large-LABEL: test_basic: + +; X64-Linux-Large: cmpq %fs:112, %rsp +; X64-Linux-Large-NEXT: ja .LBB0_2 + +; X64-Linux-Large: movabsq $40, %r10 +; X64-Linux-Large-NEXT: movabsq 
$0, %r11 +; X64-Linux-Large-NEXT: callq *__morestack_addr(%rip) +; X64-Linux-Large-NEXT: ret + +; X32ABI-LABEL: test_basic: + +; X32ABI: cmpl %fs:64, %esp +; X32ABI-NEXT: ja .LBB0_2 + +; X32ABI: movl $40, %r10d +; X32ABI-NEXT: movl $0, %r11d +; X32ABI-NEXT: callq __morestack +; X32ABI-NEXT: ret + ; X32-Darwin-LABEL: test_basic: ; X32-Darwin: movl $432, %ecx @@ -102,6 +129,26 @@ define void @test_basic() #0 { ; X64-FreeBSD-NEXT: callq __morestack ; X64-FreeBSD-NEXT: ret +; X32-DFlyBSD-LABEL: test_basic: + +; X32-DFlyBSD: cmpl %fs:16, %esp +; X32-DFlyBSD-NEXT: ja .LBB0_2 + +; X32-DFlyBSD: pushl $0 +; X32-DFlyBSD-NEXT: pushl $48 +; X32-DFlyBSD-NEXT: calll __morestack +; X32-DFlyBSD-NEXT: ret + +; X64-DFlyBSD-LABEL: test_basic: + +; X64-DFlyBSD: cmpq %fs:32, %rsp +; X64-DFlyBSD-NEXT: ja .LBB0_2 + +; X64-DFlyBSD: movabsq $40, %r10 +; X64-DFlyBSD-NEXT: movabsq $0, %r11 +; X64-DFlyBSD-NEXT: callq __morestack +; X64-DFlyBSD-NEXT: ret + } define i32 @test_nested(i32 * nest %closure, i32 %other) #0 { @@ -129,6 +176,16 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 { ; X64-Linux-NEXT: ret ; X64-Linux-NEXT: movq %rax, %r10 +; X32ABI: cmpl %fs:64, %esp +; X32ABI-NEXT: ja .LBB1_2 + +; X32ABI: movl %r10d, %eax +; X32ABI-NEXT: movl $56, %r10d +; X32ABI-NEXT: movl $0, %r11d +; X32ABI-NEXT: callq __morestack +; X32ABI-NEXT: ret +; X32ABI-NEXT: movq %rax, %r10 + ; X32-Darwin: movl $432, %edx ; X32-Darwin-NEXT: cmpl %gs:(%edx), %esp ; X32-Darwin-NEXT: ja LBB1_2 @@ -177,6 +234,24 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) #0 { ; X64-FreeBSD-NEXT: ret ; X64-FreeBSD-NEXT: movq %rax, %r10 +; X32-DFlyBSD: cmpl %fs:16, %esp +; X32-DFlyBSD-NEXT: ja .LBB1_2 + +; X32-DFlyBSD: pushl $4 +; X32-DFlyBSD-NEXT: pushl $52 +; X32-DFlyBSD-NEXT: calll __morestack +; X32-DFlyBSD-NEXT: ret + +; X64-DFlyBSD: cmpq %fs:32, %rsp +; X64-DFlyBSD-NEXT: ja .LBB1_2 + +; X64-DFlyBSD: movq %r10, %rax +; X64-DFlyBSD-NEXT: movabsq $56, %r10 +; X64-DFlyBSD-NEXT: movabsq $0, %r11 +; X64-DFlyBSD-NEXT: callq __morestack +; X64-DFlyBSD-NEXT: ret +; X64-DFlyBSD-NEXT: movq %rax, %r10 + } define void @test_large() #0 { @@ -202,6 +277,15 @@ define void @test_large() #0 { ; X64-Linux-NEXT: callq __morestack ; X64-Linux-NEXT: ret +; X32ABI: leal -40008(%rsp), %r11d +; X32ABI-NEXT: cmpl %fs:64, %r11d +; X32ABI-NEXT: ja .LBB2_2 + +; X32ABI: movl $40008, %r10d +; X32ABI-NEXT: movl $0, %r11d +; X32ABI-NEXT: callq __morestack +; X32ABI-NEXT: ret + ; X32-Darwin: leal -40012(%esp), %ecx ; X32-Darwin-NEXT: movl $432, %eax ; X32-Darwin-NEXT: cmpl %gs:(%eax), %ecx @@ -249,6 +333,24 @@ define void @test_large() #0 { ; X64-FreeBSD-NEXT: callq __morestack ; X64-FreeBSD-NEXT: ret +; X32-DFlyBSD: leal -40008(%esp), %ecx +; X32-DFlyBSD-NEXT: cmpl %fs:16, %ecx +; X32-DFlyBSD-NEXT: ja .LBB2_2 + +; X32-DFlyBSD: pushl $0 +; X32-DFlyBSD-NEXT: pushl $40008 +; X32-DFlyBSD-NEXT: calll __morestack +; X32-DFlyBSD-NEXT: ret + +; X64-DFlyBSD: leaq -40008(%rsp), %r11 +; X64-DFlyBSD-NEXT: cmpq %fs:32, %r11 +; X64-DFlyBSD-NEXT: ja .LBB2_2 + +; X64-DFlyBSD: movabsq $40008, %r10 +; X64-DFlyBSD-NEXT: movabsq $0, %r11 +; X64-DFlyBSD-NEXT: callq __morestack +; X64-DFlyBSD-NEXT: ret + } define fastcc void @test_fastcc() #0 { @@ -276,6 +378,16 @@ define fastcc void @test_fastcc() #0 { ; X64-Linux-NEXT: callq __morestack ; X64-Linux-NEXT: ret +; X32ABI-LABEL: test_fastcc: + +; X32ABI: cmpl %fs:64, %esp +; X32ABI-NEXT: ja .LBB3_2 + +; X32ABI: movl $40, %r10d +; X32ABI-NEXT: movl $0, %r11d +; X32ABI-NEXT: callq __morestack +; X32ABI-NEXT: ret + ; 
X32-Darwin-LABEL: test_fastcc: ; X32-Darwin: movl $432, %eax @@ -327,6 +439,26 @@ define fastcc void @test_fastcc() #0 { ; X64-FreeBSD-NEXT: callq __morestack ; X64-FreeBSD-NEXT: ret +; X32-DFlyBSD-LABEL: test_fastcc: + +; X32-DFlyBSD: cmpl %fs:16, %esp +; X32-DFlyBSD-NEXT: ja .LBB3_2 + +; X32-DFlyBSD: pushl $0 +; X32-DFlyBSD-NEXT: pushl $48 +; X32-DFlyBSD-NEXT: calll __morestack +; X32-DFlyBSD-NEXT: ret + +; X64-DFlyBSD-LABEL: test_fastcc: + +; X64-DFlyBSD: cmpq %fs:32, %rsp +; X64-DFlyBSD-NEXT: ja .LBB3_2 + +; X64-DFlyBSD: movabsq $40, %r10 +; X64-DFlyBSD-NEXT: movabsq $0, %r11 +; X64-DFlyBSD-NEXT: callq __morestack +; X64-DFlyBSD-NEXT: ret + } define fastcc void @test_fastcc_large() #0 { @@ -356,6 +488,17 @@ define fastcc void @test_fastcc_large() #0 { ; X64-Linux-NEXT: callq __morestack ; X64-Linux-NEXT: ret +; X32ABI-LABEL: test_fastcc_large: + +; X32ABI: leal -40008(%rsp), %r11d +; X32ABI-NEXT: cmpl %fs:64, %r11d +; X32ABI-NEXT: ja .LBB4_2 + +; X32ABI: movl $40008, %r10d +; X32ABI-NEXT: movl $0, %r11d +; X32ABI-NEXT: callq __morestack +; X32ABI-NEXT: ret + ; X32-Darwin-LABEL: test_fastcc_large: ; X32-Darwin: leal -40012(%esp), %eax @@ -412,6 +555,28 @@ define fastcc void @test_fastcc_large() #0 { ; X64-FreeBSD-NEXT: callq __morestack ; X64-FreeBSD-NEXT: ret +; X32-DFlyBSD-LABEL: test_fastcc_large: + +; X32-DFlyBSD: leal -40008(%esp), %eax +; X32-DFlyBSD-NEXT: cmpl %fs:16, %eax +; X32-DFlyBSD-NEXT: ja .LBB4_2 + +; X32-DFlyBSD: pushl $0 +; X32-DFlyBSD-NEXT: pushl $40008 +; X32-DFlyBSD-NEXT: calll __morestack +; X32-DFlyBSD-NEXT: ret + +; X64-DFlyBSD-LABEL: test_fastcc_large: + +; X64-DFlyBSD: leaq -40008(%rsp), %r11 +; X64-DFlyBSD-NEXT: cmpq %fs:32, %r11 +; X64-DFlyBSD-NEXT: ja .LBB4_2 + +; X64-DFlyBSD: movabsq $40008, %r10 +; X64-DFlyBSD-NEXT: movabsq $0, %r11 +; X64-DFlyBSD-NEXT: callq __morestack +; X64-DFlyBSD-NEXT: ret + } define fastcc void @test_fastcc_large_with_ecx_arg(i32 %a) #0 { @@ -446,6 +611,9 @@ define void @test_nostack() #0 { ; X64-Linux-LABEL: test_nostack: ; X32-Linux-NOT: callq __morestack +; X32ABI-LABEL: test_nostack: +; X32ABI-NOT: callq __morestack + ; X32-Darwin-LABEL: test_nostack: ; X32-Darwin-NOT: calll __morestack @@ -460,6 +628,16 @@ define void @test_nostack() #0 { ; X64-FreeBSD-LABEL: test_nostack: ; X64-FreeBSD-NOT: callq __morestack + +; X32-DFlyBSD-LABEL: test_nostack: +; X32-DFlyBSD-NOT: calll __morestack + +; X64-DFlyBSD-LABEL: test_nostack: +; X64-DFlyBSD-NOT: callq __morestack } attributes #0 = { "split-stack" } + +; X64-Linux-Large: .rodata +; X64-Linux-Large-NEXT: __morestack_addr: +; X64-Linux-Large-NEXT: .quad __morestack diff --git a/test/CodeGen/X86/seh-basic.ll b/test/CodeGen/X86/seh-basic.ll new file mode 100644 index 000000000000..69d70d70948c --- /dev/null +++ b/test/CodeGen/X86/seh-basic.ll @@ -0,0 +1,175 @@ +; RUN: llc -mtriple x86_64-pc-windows-msvc < %s | FileCheck %s + +define void @two_invoke_merged() { +entry: + invoke void @try_body() + to label %again unwind label %lpad + +again: + invoke void @try_body() + to label %done unwind label %lpad + +done: + ret void + +lpad: + %vals = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) + catch i8* bitcast (i32 (i8*, i8*)* @filt0 to i8*) + catch i8* bitcast (i32 (i8*, i8*)* @filt1 to i8*) + %sel = extractvalue { i8*, i32 } %vals, 1 + call void @use_selector(i32 %sel) + ret void +} + +; Normal path code + +; CHECK-LABEL: {{^}}two_invoke_merged: +; CHECK: .seh_proc two_invoke_merged +; CHECK: .seh_handler __C_specific_handler, @unwind, @except +; 
CHECK: .Ltmp0: +; CHECK: callq try_body +; CHECK-NEXT: .Ltmp1: +; CHECK: .Ltmp2: +; CHECK: callq try_body +; CHECK-NEXT: .Ltmp3: +; CHECK: retq + +; Landing pad code + +; CHECK: .Ltmp5: +; CHECK: movl $1, %ecx +; CHECK: jmp +; CHECK: .Ltmp6: +; CHECK: movl $2, %ecx +; CHECK: callq use_selector + +; CHECK: .seh_handlerdata +; CHECK-NEXT: .long 2 +; CHECK-NEXT: .long .Ltmp0@IMGREL +; CHECK-NEXT: .long .Ltmp3@IMGREL+1 +; CHECK-NEXT: .long filt0@IMGREL +; CHECK-NEXT: .long .Ltmp5@IMGREL +; CHECK-NEXT: .long .Ltmp0@IMGREL +; CHECK-NEXT: .long .Ltmp3@IMGREL+1 +; CHECK-NEXT: .long filt1@IMGREL +; CHECK-NEXT: .long .Ltmp6@IMGREL +; CHECK: .text +; CHECK: .seh_endproc + +define void @two_invoke_gap() { +entry: + invoke void @try_body() + to label %again unwind label %lpad + +again: + call void @do_nothing_on_unwind() + invoke void @try_body() + to label %done unwind label %lpad + +done: + ret void + +lpad: + %vals = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) + catch i8* bitcast (i32 (i8*, i8*)* @filt0 to i8*) + %sel = extractvalue { i8*, i32 } %vals, 1 + call void @use_selector(i32 %sel) + ret void +} + +; Normal path code + +; CHECK-LABEL: {{^}}two_invoke_gap: +; CHECK: .seh_proc two_invoke_gap +; CHECK: .seh_handler __C_specific_handler, @unwind, @except +; CHECK: .Ltmp11: +; CHECK: callq try_body +; CHECK-NEXT: .Ltmp12: +; CHECK: callq do_nothing_on_unwind +; CHECK: .Ltmp13: +; CHECK: callq try_body +; CHECK-NEXT: .Ltmp14: +; CHECK: retq + +; Landing pad code + +; CHECK: .Ltmp16: +; CHECK: movl $1, %ecx +; CHECK: callq use_selector + +; CHECK: .seh_handlerdata +; CHECK-NEXT: .long 2 +; CHECK-NEXT: .long .Ltmp11@IMGREL +; CHECK-NEXT: .long .Ltmp12@IMGREL+1 +; CHECK-NEXT: .long filt0@IMGREL +; CHECK-NEXT: .long .Ltmp16@IMGREL +; CHECK-NEXT: .long .Ltmp13@IMGREL +; CHECK-NEXT: .long .Ltmp14@IMGREL+1 +; CHECK-NEXT: .long filt0@IMGREL +; CHECK-NEXT: .long .Ltmp16@IMGREL +; CHECK: .text +; CHECK: .seh_endproc + +define void @two_invoke_nounwind_gap() { +entry: + invoke void @try_body() + to label %again unwind label %lpad + +again: + call void @cannot_unwind() + invoke void @try_body() + to label %done unwind label %lpad + +done: + ret void + +lpad: + %vals = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) + catch i8* bitcast (i32 (i8*, i8*)* @filt0 to i8*) + %sel = extractvalue { i8*, i32 } %vals, 1 + call void @use_selector(i32 %sel) + ret void +} + +; Normal path code + +; CHECK-LABEL: {{^}}two_invoke_nounwind_gap: +; CHECK: .seh_proc two_invoke_nounwind_gap +; CHECK: .seh_handler __C_specific_handler, @unwind, @except +; CHECK: .Ltmp21: +; CHECK: callq try_body +; CHECK-NEXT: .Ltmp22: +; CHECK: callq cannot_unwind +; CHECK: .Ltmp23: +; CHECK: callq try_body +; CHECK-NEXT: .Ltmp24: +; CHECK: retq + +; Landing pad code + +; CHECK: .Ltmp26: +; CHECK: movl $1, %ecx +; CHECK: callq use_selector + +; CHECK: .seh_handlerdata +; CHECK-NEXT: .long 1 +; CHECK-NEXT: .long .Ltmp21@IMGREL +; CHECK-NEXT: .long .Ltmp24@IMGREL+1 +; CHECK-NEXT: .long filt0@IMGREL +; CHECK-NEXT: .long .Ltmp26@IMGREL +; CHECK: .text +; CHECK: .seh_endproc + +declare void @try_body() +declare void @do_nothing_on_unwind() +declare void @cannot_unwind() nounwind +declare void @use_selector(i32) + +declare i32 @filt0(i8* %eh_info, i8* %rsp) +declare i32 @filt1(i8* %eh_info, i8* %rsp) + +declare void @handler0() +declare void @handler1() + +declare i32 @__C_specific_handler(...) 
+declare i32 @llvm.eh.typeid.for(i8*) readnone nounwind diff --git a/test/CodeGen/X86/seh-safe-div.ll b/test/CodeGen/X86/seh-safe-div.ll new file mode 100644 index 000000000000..e911df04ded4 --- /dev/null +++ b/test/CodeGen/X86/seh-safe-div.ll @@ -0,0 +1,196 @@ +; RUN: llc -mtriple x86_64-pc-windows-msvc < %s | FileCheck %s + +; This test case is also intended to be run manually as a complete functional +; test. It should link, print something, and exit zero rather than crashing. +; It is the hypothetical lowering of a C source program that looks like: +; +; int safe_div(int *n, int *d) { +; int r; +; __try { +; __try { +; r = *n / *d; +; } __except(GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION) { +; puts("EXCEPTION_ACCESS_VIOLATION"); +; r = -1; +; } +; } __except(GetExceptionCode() == EXCEPTION_INT_DIVIDE_BY_ZERO) { +; puts("EXCEPTION_INT_DIVIDE_BY_ZERO"); +; r = -2; +; } +; return r; +; } + +@str1 = internal constant [27 x i8] c"EXCEPTION_ACCESS_VIOLATION\00" +@str2 = internal constant [29 x i8] c"EXCEPTION_INT_DIVIDE_BY_ZERO\00" + +define i32 @safe_div(i32* %n, i32* %d) { +entry: + %r = alloca i32, align 4 + invoke void @try_body(i32* %r, i32* %n, i32* %d) + to label %__try.cont unwind label %lpad + +lpad: + %vals = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*) + catch i8* bitcast (i32 (i8*, i8*)* @safe_div_filt0 to i8*) + catch i8* bitcast (i32 (i8*, i8*)* @safe_div_filt1 to i8*) + %ehptr = extractvalue { i8*, i32 } %vals, 0 + %sel = extractvalue { i8*, i32 } %vals, 1 + %filt0_val = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 (i8*, i8*)* @safe_div_filt0 to i8*)) + %is_filt0 = icmp eq i32 %sel, %filt0_val + br i1 %is_filt0, label %handler0, label %eh.dispatch1 + +eh.dispatch1: + %filt1_val = call i32 @llvm.eh.typeid.for(i8* bitcast (i32 (i8*, i8*)* @safe_div_filt1 to i8*)) + %is_filt1 = icmp eq i32 %sel, %filt1_val + br i1 %is_filt1, label %handler1, label %eh.resume + +handler0: + call void @puts(i8* getelementptr ([27 x i8]* @str1, i32 0, i32 0)) + store i32 -1, i32* %r, align 4 + br label %__try.cont + +handler1: + call void @puts(i8* getelementptr ([29 x i8]* @str2, i32 0, i32 0)) + store i32 -2, i32* %r, align 4 + br label %__try.cont + +eh.resume: + resume { i8*, i32 } %vals + +__try.cont: + %safe_ret = load i32* %r, align 4 + ret i32 %safe_ret +} + +; Normal path code + +; CHECK: {{^}}safe_div: +; CHECK: .seh_proc safe_div +; CHECK: .seh_handler __C_specific_handler, @unwind, @except +; CHECK: .Ltmp0: +; CHECK: leaq [[rloc:.*\(%rsp\)]], %rcx +; CHECK: callq try_body +; CHECK-NEXT: .Ltmp1 +; CHECK: .LBB0_7: +; CHECK: movl [[rloc]], %eax +; CHECK: retq + +; Landing pad code + +; CHECK: .Ltmp3: +; CHECK: movl $1, %[[sel:[a-z]+]] +; CHECK: .Ltmp4 +; CHECK: movl $2, %[[sel]] +; CHECK: .L{{.*}}: +; CHECK: cmpl $1, %[[sel]] + +; CHECK: # %handler0 +; CHECK: callq puts +; CHECK: movl $-1, [[rloc]] +; CHECK: jmp .LBB0_7 + +; CHECK: cmpl $2, %[[sel]] + +; CHECK: # %handler1 +; CHECK: callq puts +; CHECK: movl $-2, [[rloc]] +; CHECK: jmp .LBB0_7 + +; FIXME: EH preparation should not call _Unwind_Resume. 
+; CHECK: callq _Unwind_Resume
+; CHECK: ud2
+
+; CHECK: .seh_handlerdata
+; CHECK: .long 2
+; CHECK: .long .Ltmp0@IMGREL
+; CHECK: .long .Ltmp1@IMGREL+1
+; CHECK: .long safe_div_filt0@IMGREL
+; CHECK: .long .Ltmp3@IMGREL
+; CHECK: .long .Ltmp0@IMGREL
+; CHECK: .long .Ltmp1@IMGREL+1
+; CHECK: .long safe_div_filt1@IMGREL
+; CHECK: .long .Ltmp4@IMGREL
+; CHECK: .text
+; CHECK: .seh_endproc
+
+
+define void @try_body(i32* %r, i32* %n, i32* %d) {
+entry:
+  %0 = load i32* %n, align 4
+  %1 = load i32* %d, align 4
+  %div = sdiv i32 %0, %1
+  store i32 %div, i32* %r, align 4
+  ret void
+}
+
+; The prototype of these filter functions is:
+; int filter(EXCEPTION_POINTERS *eh_ptrs, void *rbp);
+
+; The definition of EXCEPTION_POINTERS is:
+;   typedef struct _EXCEPTION_POINTERS {
+;     EXCEPTION_RECORD *ExceptionRecord;
+;     CONTEXT *ContextRecord;
+;   } EXCEPTION_POINTERS;
+
+; The definition of EXCEPTION_RECORD is:
+;   typedef struct _EXCEPTION_RECORD {
+;     DWORD ExceptionCode;
+;     ...
+;   } EXCEPTION_RECORD;
+
+; The exception code can be retrieved with two loads, one for the record
+; pointer and one for the code. The values of local variables can be
+; accessed via rbp, but that would require additional LLVM support that is
+; not yet implemented.
+
+define i32 @safe_div_filt0(i8* %eh_ptrs, i8* %rbp) {
+  %eh_ptrs_c = bitcast i8* %eh_ptrs to i32**
+  %eh_rec = load i32** %eh_ptrs_c
+  %eh_code = load i32* %eh_rec
+  ; EXCEPTION_ACCESS_VIOLATION = 0xC0000005
+  %cmp = icmp eq i32 %eh_code, 3221225477
+  %filt.res = zext i1 %cmp to i32
+  ret i32 %filt.res
+}
+
+define i32 @safe_div_filt1(i8* %eh_ptrs, i8* %rbp) {
+  %eh_ptrs_c = bitcast i8* %eh_ptrs to i32**
+  %eh_rec = load i32** %eh_ptrs_c
+  %eh_code = load i32* %eh_rec
+  ; EXCEPTION_INT_DIVIDE_BY_ZERO = 0xC0000094
+  %cmp = icmp eq i32 %eh_code, 3221225620
+  %filt.res = zext i1 %cmp to i32
+  ret i32 %filt.res
+}
+
+@str_result = internal constant [21 x i8] c"safe_div result: %d\0A\00"
+
+define i32 @main() {
+  %d.addr = alloca i32, align 4
+  %n.addr = alloca i32, align 4
+
+  store i32 10, i32* %n.addr, align 4
+  store i32 2, i32* %d.addr, align 4
+  %r1 = call i32 @safe_div(i32* %n.addr, i32* %d.addr)
+  call void (i8*, ...)* @printf(i8* getelementptr ([21 x i8]* @str_result, i32 0, i32 0), i32 %r1)
+
+  store i32 10, i32* %n.addr, align 4
+  store i32 0, i32* %d.addr, align 4
+  %r2 = call i32 @safe_div(i32* %n.addr, i32* %d.addr)
+  call void (i8*, ...)* @printf(i8* getelementptr ([21 x i8]* @str_result, i32 0, i32 0), i32 %r2)
+
+  %r3 = call i32 @safe_div(i32* %n.addr, i32* null)
+  call void (i8*, ...)* @printf(i8* getelementptr ([21 x i8]* @str_result, i32 0, i32 0), i32 %r3)
+  ret i32 0
+}
+
+define void @_Unwind_Resume() {
+  call void @abort()
+  unreachable
+}
+
+declare i32 @__C_specific_handler(...)
+declare i32 @llvm.eh.typeid.for(i8*) readnone nounwind
+declare void @puts(i8*)
+declare void @printf(i8*, ...)
+declare void @abort()
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index 654e8652cfcb..7e6f15321415 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -364,4 +364,40 @@ define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) {
   %tmp1 = select i1 %cc, i32 3, i32 2
   %tmp2 = shl i32 %a, %tmp1
   ret i32 %tmp2
-}
\ No newline at end of file
+}
+
+define void @test19() {
+; This is a massive reduction of an llvm-stress test case that generates
+; interesting chains feeding setcc and eventually an f32 select operation.
+; It is intended to exercise the SELECT formation in the DAG combiner when
+; simplifying a select_cc node. If it regresses and no longer triggers
+; that code path, it can be deleted.
+;
+; CHECK-LABEL: @test19
+; CHECK: testb
+; CHECK: cmpl
+; CHECK: ucomiss

+BB:
+  br label %CF
+
+CF:
+  %Cmp10 = icmp ule i8 undef, undef
+  br i1 %Cmp10, label %CF, label %CF250
+
+CF250:
+  %E12 = extractelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 2
+  %Cmp32 = icmp ugt i1 %Cmp10, false
+  br i1 %Cmp32, label %CF, label %CF242
+
+CF242:
+  %Cmp38 = icmp uge i32 %E12, undef
+  %FC = uitofp i1 %Cmp38 to float
+  %Sl59 = select i1 %Cmp32, float %FC, float undef
+  %Cmp60 = fcmp ugt float undef, undef
+  br i1 %Cmp60, label %CF242, label %CF244
+
+CF244:
+  %B122 = fadd float %Sl59, undef
+  ret void
+}
diff --git a/test/CodeGen/X86/sext-i1.ll b/test/CodeGen/X86/sext-i1.ll
index 64de0aee70d3..1a575db11b83 100644
--- a/test/CodeGen/X86/sext-i1.ll
+++ b/test/CodeGen/X86/sext-i1.ll
@@ -61,3 +61,36 @@ if.end:                                           ; preds = %if.then, %entry
   %xor27 = xor i32 undef, %cond ; <i32> [#uses=0]
   ret i32 0
 }
+
+define i32 @t4(i64 %x) nounwind readnone ssp {
+entry:
+; 32-LABEL: t4:
+; 32: movl
+; 32: orl
+; 32: movl
+; 32: je
+; 32: xorl
+
+; 64-LABEL: t4:
+; 64: cmpq $1
+; 64: sbbl
+  %0 = icmp eq i64 %x, 0
+  %1 = sext i1 %0 to i32
+  ret i32 %1
+}
+
+define i64 @t5(i32 %x) nounwind readnone ssp {
+entry:
+; 32-LABEL: t5:
+; 32: cmpl $1
+; 32: sbbl
+; 32: movl
+
+; 64-LABEL: t5:
+; 64: cmpl $1
+; 64: sbbq
+  %0 = icmp eq i32 %x, 0
+  %1 = sext i1 %0 to i64
+  ret i64 %1
+}
+
diff --git a/test/CodeGen/X86/shrink-compare.ll b/test/CodeGen/X86/shrink-compare.ll
index fc7ee061f35d..4ddef4ca5351 100644
--- a/test/CodeGen/X86/shrink-compare.ll
+++ b/test/CodeGen/X86/shrink-compare.ll
@@ -89,3 +89,151 @@ if.end:
 ; CHECK-NOT: cmpl $1,{{.*}}x+4
 ; CHECK: ret
 }
+
+; CHECK-LABEL: test2_1:
+; CHECK: movzbl
+; CHECK: cmpl $256
+; CHECK: jne
+define void @test2_1(i32 %X) nounwind minsize {
+entry:
+  %and = and i32 %X, 255
+  %cmp = icmp eq i32 %and, 256
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK-LABEL: test_sext_i8_icmp_1:
+; CHECK: cmpb $1, %{{dil|cl}}
+define void @test_sext_i8_icmp_1(i8 %x) nounwind minsize {
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 1
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK-LABEL: test_sext_i8_icmp_47:
+; CHECK: cmpb $47, %{{dil|cl}}
+define void @test_sext_i8_icmp_47(i8 %x) nounwind minsize {
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 47
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK-LABEL: test_sext_i8_icmp_127:
+; CHECK: cmpb $127, %{{dil|cl}}
+define void @test_sext_i8_icmp_127(i8 %x) nounwind minsize {
+entry:
+  %sext = sext i8 %x to i32
+  %cmp = icmp eq i32 %sext, 127
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; CHECK-LABEL: test_sext_i8_icmp_neg1:
+; CHECK: cmpb $-1, %{{dil|cl}}
+define void @test_sext_i8_icmp_neg1(i8 %x) nounwind minsize {
+entry:
+  %sext = sext i8 
%x to i32 + %cmp = icmp eq i32 %sext, -1 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +; CHECK-LABEL: test_sext_i8_icmp_neg2: +; CHECK: cmpb $-2, %{{dil|cl}} +define void @test_sext_i8_icmp_neg2(i8 %x) nounwind minsize { +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -2 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +; CHECK-LABEL: test_sext_i8_icmp_neg127: +; CHECK: cmpb $-127, %{{dil|cl}} +define void @test_sext_i8_icmp_neg127(i8 %x) nounwind minsize { +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -127 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +; CHECK-LABEL: test_sext_i8_icmp_neg128: +; CHECK: cmpb $-128, %{{dil|cl}} +define void @test_sext_i8_icmp_neg128(i8 %x) nounwind minsize { +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, -128 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +; CHECK-LABEL: test_sext_i8_icmp_255: +; CHECK: movb $1, +; CHECK: testb +; CHECK: jne +define void @test_sext_i8_icmp_255(i8 %x) nounwind minsize { +entry: + %sext = sext i8 %x to i32 + %cmp = icmp eq i32 %sext, 255 + br i1 %cmp, label %if.then, label %if.end + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} diff --git a/test/CodeGen/X86/sibcall-4.ll b/test/CodeGen/X86/sibcall-4.ll index 980b0f797ee1..2c7f51d28025 100644 --- a/test/CodeGen/X86/sibcall-4.ll +++ b/test/CodeGen/X86/sibcall-4.ll @@ -1,13 +1,13 @@ ; RUN: llc < %s -mtriple=i386-pc-linux-gnu | FileCheck %s ; pr7610 -define cc10 void @t(i32* %Base_Arg, i32* %Sp_Arg, i32* %Hp_Arg, i32 %R1_Arg) nounwind { +define ghccc void @t(i32* %Base_Arg, i32* %Sp_Arg, i32* %Hp_Arg, i32 %R1_Arg) nounwind { cm1: ; CHECK-LABEL: t: ; CHECK: jmpl *%eax %nm3 = getelementptr i32* %Sp_Arg, i32 1 %nm9 = load i32* %Sp_Arg %nma = inttoptr i32 %nm9 to void (i32*, i32*, i32*, i32)* - tail call cc10 void %nma(i32* %Base_Arg, i32* %nm3, i32* %Hp_Arg, i32 %R1_Arg) nounwind + tail call ghccc void %nma(i32* %Base_Arg, i32* %nm3, i32* %Hp_Arg, i32 %R1_Arg) nounwind ret void } diff --git a/test/CodeGen/X86/sibcall-5.ll b/test/CodeGen/X86/sibcall-5.ll index c04af234b131..b065cce17b24 100644 --- a/test/CodeGen/X86/sibcall-5.ll +++ b/test/CodeGen/X86/sibcall-5.ll @@ -62,4 +62,4 @@ declare i8* @objc_msgSend(i8*, i8*, ...) 
declare double @floor(double) optsize
 
-!0 = metadata !{}
+!0 = !{}
diff --git a/test/CodeGen/X86/sincos-opt.ll b/test/CodeGen/X86/sincos-opt.ll
index 2dc8816f840f..1e34a2be10b3 100644
--- a/test/CodeGen/X86/sincos-opt.ll
+++ b/test/CodeGen/X86/sincos-opt.ll
@@ -15,7 +15,8 @@ entry:
 ; OSX_SINCOS-LABEL: test1:
 ; OSX_SINCOS: callq ___sincosf_stret
-; OSX_SINCOS: pshufd $1, %xmm0, %xmm1
+; OSX_SINCOS: movaps %xmm0, %xmm1
+; OSX_SINCOS: shufps {{.*}} ## xmm1 = xmm1[1,1,2,3]
 ; OSX_SINCOS: addss %xmm0, %xmm1
 
 ; OSX_NOOPT: test1
diff --git a/test/CodeGen/X86/sink-blockfreq.ll b/test/CodeGen/X86/sink-blockfreq.ll
new file mode 100644
index 000000000000..c2f0411901a7
--- /dev/null
+++ b/test/CodeGen/X86/sink-blockfreq.ll
@@ -0,0 +1,45 @@
+; RUN: llc -disable-machine-licm -machine-sink-bfi=true -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_BFI
+; RUN: llc -disable-machine-licm -machine-sink-bfi=false -mtriple=x86_64-apple-darwin < %s | FileCheck %s -check-prefix=MSINK_NOBFI
+
+; Test that by changing BlockFrequencyInfo we change the order in which
+; machine-sink looks for successor blocks. By not using BFI, both G and B
+; have the same loop depth and no instructions are sunk - B is selected but
+; can't be used, to avoid breaking a non-profitable critical edge. By using
+; BFI, "mul" is sunk into the less frequent block G.
+define i32 @sink_freqinfo(i32 %a, i32 %b) nounwind uwtable ssp {
+; MSINK_BFI-LABEL: sink_freqinfo
+; MSINK_BFI: jl
+; MSINK_BFI-NEXT: ## BB#
+; MSINK_BFI-NEXT: imull
+
+; MSINK_NOBFI-LABEL: sink_freqinfo
+; MSINK_NOBFI: imull
+; MSINK_NOBFI: jl
+entry:
+  br label %B
+
+B:
+  %ee = phi i32 [ 0, %entry ], [ %inc, %F ]
+  %xx = sub i32 %a, %ee
+  %cond0 = icmp slt i32 %xx, 0
+  br i1 %cond0, label %F, label %exit, !prof !0
+
+F:
+  %inc = add nsw i32 %xx, 2
+  %aa = mul nsw i32 %b, %inc
+  %exitcond = icmp slt i32 %inc, %a
+  br i1 %exitcond, label %B, label %G, !prof !1
+
+G:
+  %ii = add nsw i32 %aa, %a
+  %ll = add i32 %b, 45
+  %exitcond2 = icmp sge i32 %ii, %b
+  br i1 %exitcond2, label %G, label %exit, !prof !2
+
+exit:
+  ret i32 0
+}
+
+!0 = !{!"branch_weights", i32 4, i32 1}
+!1 = !{!"branch_weights", i32 128, i32 1}
+!2 = !{!"branch_weights", i32 1, i32 1}
diff --git a/test/CodeGen/X86/sink-hoist.ll b/test/CodeGen/X86/sink-hoist.ll
index 64f5311792db..455cf24bce1c 100644
--- a/test/CodeGen/X86/sink-hoist.ll
+++ b/test/CodeGen/X86/sink-hoist.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -asm-verbose=false -mtriple=x86_64-unknown-linux-gnu -mcpu=nehalem -post-RA-scheduler=true -schedmodel=false | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -march=x86-64 -asm-verbose=false -mtriple=x86_64-unknown-linux-gnu -mcpu=nehalem -post-RA-scheduler=true -schedmodel=false | FileCheck %s
 
 ; Currently, floating-point selects are lowered to CFG triangles.
 ; This means that one side of the select is always unconditionally
diff --git a/test/CodeGen/X86/sink-out-of-loop.ll b/test/CodeGen/X86/sink-out-of-loop.ll
index c600f925a32b..6757f315b6da 100644
--- a/test/CodeGen/X86/sink-out-of-loop.ll
+++ b/test/CodeGen/X86/sink-out-of-loop.ll
@@ -5,7 +5,7 @@
 ; MOV32ri outside the loop.
; rdar://11980766 define i32 @sink_succ(i32 %argc, i8** nocapture %argv) nounwind uwtable ssp { -; CHECK: sink_succ +; CHECK-LABEL: sink_succ ; CHECK: [[OUTER_LN1:LBB0_[0-9]+]]: ## %preheader ; CHECK: %exit ; CHECK-NOT: movl @@ -52,3 +52,24 @@ for.body2: for.end20: ret i32 0 } + +define i32 @sink_out_of_loop(i32 %n, i32* %output) { +; CHECK-LABEL: sink_out_of_loop: +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %i2, %loop ] + %j = mul i32 %i, %i + %addr = getelementptr i32* %output, i32 %i + store i32 %i, i32* %addr + %i2 = add i32 %i, 1 + %exit_cond = icmp sge i32 %i2, %n + br i1 %exit_cond, label %exit, label %loop + +exit: +; CHECK: BB#2 +; CHECK: imull %eax, %eax +; CHECK: retq + ret i32 %j +} diff --git a/test/CodeGen/X86/sjlj-baseptr.ll b/test/CodeGen/X86/sjlj-baseptr.ll new file mode 100644 index 000000000000..e439ff4dbd2f --- /dev/null +++ b/test/CodeGen/X86/sjlj-baseptr.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X86 %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X64 %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%Foo = type { [125 x i8] } + +declare i32 @llvm.eh.sjlj.setjmp(i8*) nounwind + +declare void @whatever(i64, %Foo*, i8**, i8*, i8*, i32) #0 + +attributes #0 = { nounwind uwtable "no-frame-pointer-elim"="true" } + +define i32 @test1(i64 %n, %Foo* byval nocapture readnone align 8 %f) #0 { +entry: + %buf = alloca [5 x i8*], align 16 + %p = alloca i8*, align 8 + %q = alloca i8, align 64 + %r = bitcast [5 x i8*]* %buf to i8* + %s = alloca i8, i64 %n, align 1 + store i8* %s, i8** %p, align 8 + %t = call i32 @llvm.eh.sjlj.setjmp(i8* %s) + call void @whatever(i64 %n, %Foo* %f, i8** %p, i8* %q, i8* %s, i32 %t) #1 + ret i32 0 +; X86: movl %esp, %esi +; X86: movl %esp, -16(%ebp) +; X86: {{.LBB.*:}} +; X86: movl -16(%ebp), %esi +; X86: {{.LBB.*:}} +; X64: movq %rsp, %rbx +; X64: movq %rsp, -48(%rbp) +; X64: {{.LBB.*:}} +; X64: movq -48(%rbp), %rbx +; X64: {{.LBB.*:}} +} + + diff --git a/test/CodeGen/X86/slow-div.ll b/test/CodeGen/X86/slow-div.ll new file mode 100644 index 000000000000..52223824bf96 --- /dev/null +++ b/test/CodeGen/X86/slow-div.ll @@ -0,0 +1,28 @@ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivl-to-divb < %s | FileCheck -check-prefix=DIV32 %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+idivq-to-divw < %s | FileCheck -check-prefix=DIV64 %s + +define i32 @div32(i32 %a, i32 %b) { +entry: +; DIV32-LABEL: div32: +; DIV32: orl %{{.*}}, [[REG:%[a-z]+]] +; DIV32: testl $-256, [[REG]] +; DIV32: divb +; DIV64-LABEL: div32: +; DIV64-NOT: divb + %div = sdiv i32 %a, %b + ret i32 %div +} + +define i64 @div64(i64 %a, i64 %b) { +entry: +; DIV32-LABEL: div64: +; DIV32-NOT: divw +; DIV64-LABEL: div64: +; DIV64: orq %{{.*}}, [[REG:%[a-z]+]] +; DIV64: testq $-65536, [[REG]] +; DIV64: divw + %div = sdiv i64 %a, %b + ret i64 %div +} + + diff --git a/test/CodeGen/X86/slow-incdec.ll b/test/CodeGen/X86/slow-incdec.ll new file mode 100644 index 000000000000..323e3ae8c472 --- /dev/null +++ b/test/CodeGen/X86/slow-incdec.ll @@ -0,0 +1,80 @@ +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-slow-incdec < %s | FileCheck -check-prefix=INCDEC %s +; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+slow-incdec < %s | FileCheck -check-prefix=ADD %s + +; check -mattr=-slow-incdec +; INCDEC-NOT: addl $-1 +; INCDEC: dec +; INCDEC-NOT: addl $1 +; INCDEC: inc + +; check -mattr=+slow-incdec +; ADD: addl 
$-1 +; ADD-NOT: dec +; ADD: addl $1 +; ADD-NOT: inc + +; Function Attrs: nounwind readonly +define i32 @slow_1(i32* nocapture readonly %a, i32 %s) #0 { +entry: + %cmp5 = icmp eq i32 %s, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond: ; preds = %for.body + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %for.end.loopexit, label %for.body + +for.body: ; preds = %for.body.preheader, %for.cond + %i.06 = phi i32 [ %dec, %for.cond ], [ %s, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32* %a, i32 %i.06 + %0 = load i32* %arrayidx, align 4, !tbaa !1 + %cmp1 = icmp eq i32 %0, 0 +; + %dec = add nsw i32 %i.06, -1 + br i1 %cmp1, label %for.end.loopexit, label %for.cond + +for.end.loopexit: ; preds = %for.cond, %for.body + %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ] + ret i32 %i.0.lcssa +} + +; Function Attrs: nounwind readonly +define i32 @slow_2(i32* nocapture readonly %a, i32 %s) #0 { +entry: + %cmp5 = icmp eq i32 %s, 0 + br i1 %cmp5, label %for.end, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond: ; preds = %for.body + %cmp = icmp eq i32 %inc, 0 + br i1 %cmp, label %for.end.loopexit, label %for.body + +for.body: ; preds = %for.body.preheader, %for.cond + %i.06 = phi i32 [ %inc, %for.cond ], [ %s, %for.body.preheader ] + %arrayidx = getelementptr inbounds i32* %a, i32 %i.06 + %0 = load i32* %arrayidx, align 4, !tbaa !1 + %cmp1 = icmp eq i32 %0, 0 + %inc = add nsw i32 %i.06, 1 + br i1 %cmp1, label %for.end.loopexit, label %for.cond + +for.end.loopexit: ; preds = %for.cond, %for.body + %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ] + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ] + ret i32 %i.0.lcssa +} + +!1 = !{!2, !2, i64 0} +!2 = !{!"int", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C/C++ TBAA"} diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll index 1b596b589899..3c03750199cb 100644 --- a/test/CodeGen/X86/small-byval-memcpy.ll +++ b/test/CodeGen/X86/small-byval-memcpy.ll @@ -1,20 +1,25 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=core2 | grep movsd | count 8 -; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=nehalem | grep movups | count 2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s --check-prefix=CORE2 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck %s --check-prefix=NEHALEM +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 -define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %z) nounwind { -entry: - %iz = alloca { x86_fp80, x86_fp80 } ; <{ x86_fp80, x86_fp80 }*> [#uses=3] - %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1 ; <x86_fp80*> [#uses=1] - %tmp2 = load x86_fp80* %tmp1, align 16 ; <x86_fp80> [#uses=1] - %tmp3 = fsub x86_fp80 0xK80000000000000000000, %tmp2 ; <x86_fp80> [#uses=1] - %tmp4 = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 1 ; <x86_fp80*> [#uses=1] - %real = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 0 ; <x86_fp80*> [#uses=1] - %tmp6 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 0 ; <x86_fp80*> 
[#uses=1] - %tmp7 = load x86_fp80* %tmp6, align 16 ; <x86_fp80> [#uses=1] - store x86_fp80 %tmp3, x86_fp80* %real, align 16 - store x86_fp80 %tmp7, x86_fp80* %tmp4, align 16 - call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval align 4 %iz ) nounwind - ret void -} +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) + +define void @copy16bytes(i8* nocapture %a, i8* nocapture readonly %b) { + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 16, i32 1, i1 false) + ret void + + ; CHECK-LABEL: copy16bytes + ; CORE2: movq + ; CORE2-NEXT: movq + ; CORE2-NEXT: movq + ; CORE2-NEXT: movq + ; CORE2-NEXT: retq -declare void @ccoshl({ x86_fp80, x86_fp80 }* noalias sret , { x86_fp80, x86_fp80 }* byval align 4 ) nounwind + ; NEHALEM: movups + ; NEHALEM-NEXT: movups + ; NEHALEM-NEXT: retq + + ; BTVER2: movups + ; BTVER2-NEXT: movups + ; BTVER2-NEXT: retq +} diff --git a/test/CodeGen/X86/splat-for-size.ll b/test/CodeGen/X86/splat-for-size.ll new file mode 100644 index 000000000000..c052ad2aa146 --- /dev/null +++ b/test/CodeGen/X86/splat-for-size.ll @@ -0,0 +1,141 @@ +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s -check-prefix=CHECK --check-prefix=AVX2 + +; Check constant loads of every 128-bit and 256-bit vector type +; for size optimization using splat ops available with AVX and AVX2. + +; There is no AVX broadcast from double to 128-bit vector because movddup has been around since SSE3 (grrr). +define <2 x double> @splat_v2f64(<2 x double> %x) #0 { + %add = fadd <2 x double> %x, <double 1.0, double 1.0> + ret <2 x double> %add +; CHECK-LABEL: splat_v2f64 +; CHECK: vmovddup +; CHECK: vaddpd +; CHECK-NEXT: retq +} + +define <4 x double> @splat_v4f64(<4 x double> %x) #0 { + %add = fadd <4 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0> + ret <4 x double> %add +; CHECK-LABEL: splat_v4f64 +; CHECK: vbroadcastsd +; CHECK-NEXT: vaddpd +; CHECK-NEXT: retq +} + +define <4 x float> @splat_v4f32(<4 x float> %x) #0 { + %add = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0> + ret <4 x float> %add +; CHECK-LABEL: splat_v4f32 +; CHECK: vbroadcastss +; CHECK-NEXT: vaddps +; CHECK-NEXT: retq +} + +define <8 x float> @splat_v8f32(<8 x float> %x) #0 { + %add = fadd <8 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0> + ret <8 x float> %add +; CHECK-LABEL: splat_v8f32 +; CHECK: vbroadcastss +; CHECK-NEXT: vaddps +; CHECK-NEXT: retq +} + +; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value. +; We also generate vmovddup for AVX2 because it's one byte smaller than vpbroadcastq. +define <2 x i64> @splat_v2i64(<2 x i64> %x) #0 { + %add = add <2 x i64> %x, <i64 1, i64 1> + ret <2 x i64> %add +; CHECK-LABEL: splat_v2i64 +; CHECK: vmovddup +; CHECK: vpaddq +; CHECK-NEXT: retq +} + +; AVX can't do 256-bit integer ops, so we split this into two 128-bit vectors, +; and then we fake it: use vmovddup to splat 64-bit value. +define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 { + %add = add <4 x i64> %x, <i64 1, i64 1, i64 1, i64 1> + ret <4 x i64> %add +; CHECK-LABEL: splat_v4i64 +; AVX: vmovddup +; AVX: vpaddq +; AVX: vpaddq +; AVX2: vpbroadcastq +; AVX2: vpaddq +; CHECK: retq +} + +; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value. 
+define <4 x i32> @splat_v4i32(<4 x i32> %x) #0 { + %add = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1> + ret <4 x i32> %add +; CHECK-LABEL: splat_v4i32 +; AVX: vbroadcastss +; AVX2: vpbroadcastd +; CHECK-NEXT: vpaddd +; CHECK-NEXT: retq +} + +; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value. +define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 { + %add = add <8 x i32> %x, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + ret <8 x i32> %add +; CHECK-LABEL: splat_v8i32 +; AVX: vbroadcastss +; AVX: vpaddd +; AVX: vpaddd +; AVX2: vpbroadcastd +; AVX2: vpaddd +; CHECK: retq +} + +; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc? +define <8 x i16> @splat_v8i16(<8 x i16> %x) #0 { + %add = add <8 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + ret <8 x i16> %add +; CHECK-LABEL: splat_v8i16 +; AVX-NOT: broadcast +; AVX2: vpbroadcastw +; CHECK: vpaddw +; CHECK-NEXT: retq +} + +; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc? +define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 { + %add = add <16 x i16> %x, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> + ret <16 x i16> %add +; CHECK-LABEL: splat_v16i16 +; AVX-NOT: broadcast +; AVX: vpaddw +; AVX: vpaddw +; AVX2: vpbroadcastw +; AVX2: vpaddw +; CHECK: retq +} + +; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc? +define <16 x i8> @splat_v16i8(<16 x i8> %x) #0 { + %add = add <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> + ret <16 x i8> %add +; CHECK-LABEL: splat_v16i8 +; AVX-NOT: broadcast +; AVX2: vpbroadcastb +; CHECK: vpaddb +; CHECK-NEXT: retq +} + +; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc? 
+define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 { + %add = add <32 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> + ret <32 x i8> %add +; CHECK-LABEL: splat_v32i8 +; AVX-NOT: broadcast +; AVX: vpaddb +; AVX: vpaddb +; AVX2: vpbroadcastb +; AVX2: vpaddb +; CHECK: retq +} + +attributes #0 = { optsize } diff --git a/test/CodeGen/X86/splat-scalar-load.ll b/test/CodeGen/X86/splat-scalar-load.ll deleted file mode 100644 index 4d59b9cc2f63..000000000000 --- a/test/CodeGen/X86/splat-scalar-load.ll +++ /dev/null @@ -1,17 +0,0 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 -mcpu=nehalem | FileCheck %s -; rdar://7434544 - -define <2 x i64> @t2() nounwind { -entry: -; CHECK-LABEL: t2: -; CHECK: pshufd $85, (%esp), %xmm0 - %array = alloca [8 x float], align 4 - %arrayidx = getelementptr inbounds [8 x float]* %array, i32 0, i32 1 - %tmp2 = load float* %arrayidx - %vecinit = insertelement <4 x float> undef, float %tmp2, i32 0 - %vecinit5 = insertelement <4 x float> %vecinit, float %tmp2, i32 1 - %vecinit7 = insertelement <4 x float> %vecinit5, float %tmp2, i32 2 - %vecinit9 = insertelement <4 x float> %vecinit7, float %tmp2, i32 3 - %0 = bitcast <4 x float> %vecinit9 to <2 x i64> - ret <2 x i64> %0 -} diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll index fc79e31e72ee..24b175eed7a3 100644 --- a/test/CodeGen/X86/sqrt-fastmath.ll +++ b/test/CodeGen/X86/sqrt-fastmath.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mcpu=core2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 ; generated using "clang -S -O2 -ffast-math -emit-llvm sqrt.c" from ; #include <math.h> @@ -52,9 +53,80 @@ entry: ret x86_fp80 %call } -; Function Attrs: nounwind readnone declare x86_fp80 @__sqrtl_finite(x86_fp80) #1 +declare float @llvm.sqrt.f32(float) #1 +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #1 +declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1 + +; If the target's sqrtss and divss instructions are substantially +; slower than rsqrtss with a Newton-Raphson refinement, we should +; generate the estimate sequence. 
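+;
+; For reference (a sketch added for exposition, not part of the generated
+; checks): given an initial estimate E0 ~= 1/sqrt(x) from rsqrtss/rsqrtps,
+; one Newton-Raphson step computes
+;   E1 = E0 * (1.5 - 0.5 * x * E0 * E0)
+; which corresponds to the vrsqrt*/vmul*/vadd* sequences expected for
+; BTVER2 below.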
+ +define float @reciprocal_square_root(float %x) #0 { + %sqrt = tail call float @llvm.sqrt.f32(float %x) + %div = fdiv fast float 1.0, %sqrt + ret float %div + +; CHECK-LABEL: reciprocal_square_root: +; CHECK: sqrtss +; CHECK-NEXT: movss +; CHECK-NEXT: divss +; CHECK-NEXT: retq +; BTVER2-LABEL: reciprocal_square_root: +; BTVER2: vrsqrtss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: vaddss +; BTVER2-NEXT: vmulss +; BTVER2-NEXT: retq +} + +define <4 x float> @reciprocal_square_root_v4f32(<4 x float> %x) #0 { + %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) + %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt + ret <4 x float> %div + +; CHECK-LABEL: reciprocal_square_root_v4f32: +; CHECK: sqrtps +; CHECK-NEXT: movaps +; CHECK-NEXT: divps +; CHECK-NEXT: retq +; BTVER2-LABEL: reciprocal_square_root_v4f32: +; BTVER2: vrsqrtps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vaddps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: retq +} + +define <8 x float> @reciprocal_square_root_v8f32(<8 x float> %x) #0 { + %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x) + %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt + ret <8 x float> %div + +; CHECK-LABEL: reciprocal_square_root_v8f32: +; CHECK: sqrtps +; CHECK-NEXT: sqrtps +; CHECK-NEXT: movaps +; CHECK-NEXT: movaps +; CHECK-NEXT: divps +; CHECK-NEXT: divps +; CHECK-NEXT: retq +; BTVER2-LABEL: reciprocal_square_root_v8f32: +; BTVER2: vrsqrtps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: vaddps +; BTVER2-NEXT: vmulps +; BTVER2-NEXT: retq +} + + attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #2 = { nounwind readnone } diff --git a/test/CodeGen/X86/sse-align-12.ll b/test/CodeGen/X86/sse-align-12.ll index 2351fd6fa77b..396da0f48956 100644 --- a/test/CodeGen/X86/sse-align-12.ll +++ b/test/CodeGen/X86/sse-align-12.ll @@ -1,9 +1,11 @@ -; RUN: llc < %s -march=x86-64 -mcpu=nehalem | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=nehalem | FileCheck %s -; CHECK-LABEL: a: -; CHECK: movdqu -; CHECK: pshufd define <4 x float> @a(<4 x float>* %y) nounwind { +; CHECK-LABEL: a: +; CHECK: # BB#0: +; CHECK-NEXT: movups (%rdi), %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: retq %x = load <4 x float>* %y, align 4 %a = extractelement <4 x float> %x, i32 0 %b = extractelement <4 x float> %x, i32 1 @@ -16,10 +18,12 @@ define <4 x float> @a(<4 x float>* %y) nounwind { ret <4 x float> %s } -; CHECK-LABEL: b: -; CHECK: movups -; CHECK: unpckhps define <4 x float> @b(<4 x float>* %y, <4 x float> %z) nounwind { +; CHECK-LABEL: b: +; CHECK: # BB#0: +; CHECK-NEXT: movups (%rdi), %xmm1 +; CHECK-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: retq %x = load <4 x float>* %y, align 4 %a = extractelement <4 x float> %x, i32 2 %b = extractelement <4 x float> %x, i32 3 @@ -32,10 +36,12 @@ define <4 x float> @b(<4 x float>* %y, <4 x float> %z) nounwind { ret <4 x float> %s } -; CHECK-LABEL: c: -; 
CHECK: movupd -; CHECK: shufpd define <2 x double> @c(<2 x double>* %y) nounwind { +; CHECK-LABEL: c: +; CHECK: # BB#0: +; CHECK-NEXT: movupd (%rdi), %xmm0 +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: retq %x = load <2 x double>* %y, align 8 %a = extractelement <2 x double> %x, i32 0 %c = extractelement <2 x double> %x, i32 1 @@ -44,10 +50,12 @@ define <2 x double> @c(<2 x double>* %y) nounwind { ret <2 x double> %r } -; CHECK-LABEL: d: -; CHECK: movupd -; CHECK: unpckhpd define <2 x double> @d(<2 x double>* %y, <2 x double> %z) nounwind { +; CHECK-LABEL: d: +; CHECK: # BB#0: +; CHECK-NEXT: movupd (%rdi), %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: retq %x = load <2 x double>* %y, align 8 %a = extractelement <2 x double> %x, i32 1 %c = extractelement <2 x double> %z, i32 1 diff --git a/test/CodeGen/X86/sse-domains.ll b/test/CodeGen/X86/sse-domains.ll index 168959a5d653..8cf522dd3c15 100644 --- a/test/CodeGen/X86/sse-domains.ll +++ b/test/CodeGen/X86/sse-domains.ll @@ -43,45 +43,3 @@ while.body: while.end: ret void } - -; CHECK: f2 -; CHECK: for.body -; -; This loop contains two cvtsi2ss instructions that update the same xmm -; register. Verify that the execution dependency fix pass breaks those -; dependencies by inserting xorps instructions. -; -; If the register allocator chooses different registers for the two cvtsi2ss -; instructions, they are still dependent on themselves. -; CHECK: xorps [[XMM1:%xmm[0-9]+]] -; CHECK: , [[XMM1]] -; CHECK: cvtsi2ssl %{{.*}}, [[XMM1]] -; CHECK: xorps [[XMM2:%xmm[0-9]+]] -; CHECK: , [[XMM2]] -; CHECK: cvtsi2ssl %{{.*}}, [[XMM2]] -; -define float @f2(i32 %m) nounwind uwtable readnone ssp { -entry: - %tobool3 = icmp eq i32 %m, 0 - br i1 %tobool3, label %for.end, label %for.body - -for.body: ; preds = %entry, %for.body - %m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ] - %s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ] - %s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ] - %n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ] - %conv = sitofp i32 %n.04 to float - %add = fadd float %s1.06, %conv - %conv1 = sitofp i32 %m.addr.07 to float - %add2 = fadd float %s2.05, %conv1 - %inc = add nsw i32 %n.04, 1 - %dec = add nsw i32 %m.addr.07, -1 - %tobool = icmp eq i32 %dec, 0 - br i1 %tobool, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - %s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] - %s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ] - %sub = fsub float %s1.0.lcssa, %s2.0.lcssa - ret float %sub -} diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll index 5122c44131a4..4dcb54ca4b0b 100644 --- a/test/CodeGen/X86/sse-minmax.ll +++ b/test/CodeGen/X86/sse-minmax.ll @@ -138,8 +138,7 @@ define double @ole_inverse(double %x, double %y) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: ogt_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ogt_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -157,8 +156,7 @@ define double @ogt_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: olt_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: olt_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -177,8 +175,7 @@ define double 
@olt_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: ogt_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ogt_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -198,8 +195,7 @@ define double @ogt_inverse_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: olt_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: olt_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -217,8 +213,7 @@ define double @olt_inverse_x(double %x) nounwind { ; CHECK-NEXT: andpd ; UNSAFE-LABEL: oge_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: oge_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -235,8 +230,7 @@ define double @oge_x(double %x) nounwind { ; CHECK-NEXT: andpd ; UNSAFE-LABEL: ole_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ole_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -253,8 +247,7 @@ define double @ole_x(double %x) nounwind { ; CHECK-NEXT: andnpd ; UNSAFE-LABEL: oge_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: oge_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -271,8 +264,7 @@ define double @oge_inverse_x(double %x) nounwind { ; CHECK: cmplesd %xmm ; UNSAFE-LABEL: ole_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ole_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -412,8 +404,7 @@ define double @ule_inverse(double %x, double %y) nounwind { ; CHECK-NEXT: andpd ; UNSAFE-LABEL: ugt_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ugt_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -430,8 +421,7 @@ define double @ugt_x(double %x) nounwind { ; CHECK-NEXT: andpd ; UNSAFE-LABEL: ult_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ult_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -448,8 +438,7 @@ define double @ult_x(double %x) nounwind { ; CHECK-NEXT: andnpd ; UNSAFE-LABEL: ugt_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ugt_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -467,8 +456,7 @@ define double @ugt_inverse_x(double %x) nounwind { ; CHECK-NEXT: andnpd ; UNSAFE-LABEL: ult_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ult_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -488,8 
+476,7 @@ define double @ult_inverse_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: uge_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: uge_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -508,8 +495,7 @@ define double @uge_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: ule_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ule_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -527,8 +513,7 @@ define double @ule_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: uge_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: minsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: minsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: uge_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -547,8 +532,7 @@ define double @uge_inverse_x(double %x) nounwind { ; CHECK-NEXT: ret ; UNSAFE-LABEL: ule_inverse_x: ; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 -; UNSAFE-NEXT: maxsd %xmm0, %xmm1 -; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0 +; UNSAFE-NEXT: maxsd %xmm1, %xmm0 ; UNSAFE-NEXT: ret ; FINITE-LABEL: ule_inverse_x: ; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1 @@ -819,11 +803,18 @@ define double @ule_inverse_y(double %x) nounwind { ; Test a few more misc. cases. ; CHECK-LABEL: clampTo3k_a: -; CHECK: minsd +; CHECK-NEXT: movsd {{[^,]*}}, %xmm1 +; CHECK-NEXT: minsd %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: ret ; UNSAFE-LABEL: clampTo3k_a: -; UNSAFE: minsd +; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0 +; UNSAFE-NEXT: ret ; FINITE-LABEL: clampTo3k_a: -; FINITE: minsd +; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 +; FINITE-NEXT: minsd %xmm0, %xmm1 +; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: ret define double @clampTo3k_a(double %x) nounwind readnone { entry: %0 = fcmp ogt double %x, 3.000000e+03 ; <i1> [#uses=1] @@ -832,11 +823,16 @@ entry: } ; CHECK-LABEL: clampTo3k_b: -; CHECK: minsd +; CHECK-NEXT: minsd {{[^,]*}}, %xmm0 +; CHECK-NEXT: ret ; UNSAFE-LABEL: clampTo3k_b: -; UNSAFE: minsd +; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0 +; UNSAFE-NEXT: ret ; FINITE-LABEL: clampTo3k_b: -; FINITE: minsd +; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 +; FINITE-NEXT: minsd %xmm0, %xmm1 +; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: ret define double @clampTo3k_b(double %x) nounwind readnone { entry: %0 = fcmp uge double %x, 3.000000e+03 ; <i1> [#uses=1] @@ -845,11 +841,18 @@ entry: } ; CHECK-LABEL: clampTo3k_c: -; CHECK: maxsd +; CHECK-NEXT: movsd {{[^,]*}}, %xmm1 +; CHECK-NEXT: maxsd %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: ret ; UNSAFE-LABEL: clampTo3k_c: -; UNSAFE: maxsd +; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0 +; UNSAFE-NEXT: ret ; FINITE-LABEL: clampTo3k_c: -; FINITE: maxsd +; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 +; FINITE-NEXT: maxsd %xmm0, %xmm1 +; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: ret define double @clampTo3k_c(double %x) nounwind readnone { entry: %0 = fcmp olt double %x, 3.000000e+03 ; <i1> [#uses=1] @@ -858,11 +861,16 @@ entry: } ; CHECK-LABEL: clampTo3k_d: -; CHECK: maxsd +; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0 +; CHECK-NEXT: ret ; UNSAFE-LABEL: clampTo3k_d: -; UNSAFE: maxsd +; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0 +; UNSAFE-NEXT: ret ; FINITE-LABEL: clampTo3k_d: -; FINITE: maxsd +; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 +; FINITE-NEXT: 
maxsd %xmm0, %xmm1 +; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: ret define double @clampTo3k_d(double %x) nounwind readnone { entry: %0 = fcmp ule double %x, 3.000000e+03 ; <i1> [#uses=1] @@ -871,11 +879,18 @@ entry: } ; CHECK-LABEL: clampTo3k_e: -; CHECK: maxsd +; CHECK-NEXT: movsd {{[^,]*}}, %xmm1 +; CHECK-NEXT: maxsd %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: ret ; UNSAFE-LABEL: clampTo3k_e: -; UNSAFE: maxsd +; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0 +; UNSAFE-NEXT: ret ; FINITE-LABEL: clampTo3k_e: -; FINITE: maxsd +; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 +; FINITE-NEXT: maxsd %xmm0, %xmm1 +; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: ret define double @clampTo3k_e(double %x) nounwind readnone { entry: %0 = fcmp olt double %x, 3.000000e+03 ; <i1> [#uses=1] @@ -884,11 +899,16 @@ entry: } ; CHECK-LABEL: clampTo3k_f: -; CHECK: maxsd +; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0 +; CHECK-NEXT: ret ; UNSAFE-LABEL: clampTo3k_f: -; UNSAFE: maxsd +; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0 +; UNSAFE-NEXT: ret ; FINITE-LABEL: clampTo3k_f: -; FINITE: maxsd +; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 +; FINITE-NEXT: maxsd %xmm0, %xmm1 +; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: ret define double @clampTo3k_f(double %x) nounwind readnone { entry: %0 = fcmp ule double %x, 3.000000e+03 ; <i1> [#uses=1] @@ -897,11 +917,18 @@ entry: } ; CHECK-LABEL: clampTo3k_g: -; CHECK: minsd +; CHECK-NEXT: movsd {{[^,]*}}, %xmm1 +; CHECK-NEXT: minsd %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: ret ; UNSAFE-LABEL: clampTo3k_g: -; UNSAFE: minsd +; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0 +; UNSAFE-NEXT: ret ; FINITE-LABEL: clampTo3k_g: -; FINITE: minsd +; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 +; FINITE-NEXT: minsd %xmm0, %xmm1 +; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: ret define double @clampTo3k_g(double %x) nounwind readnone { entry: %0 = fcmp ogt double %x, 3.000000e+03 ; <i1> [#uses=1] @@ -910,11 +937,16 @@ entry: } ; CHECK-LABEL: clampTo3k_h: -; CHECK: minsd +; CHECK-NEXT: minsd {{[^,]*}}, %xmm0 +; CHECK-NEXT: ret ; UNSAFE-LABEL: clampTo3k_h: -; UNSAFE: minsd +; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0 +; UNSAFE-NEXT: ret ; FINITE-LABEL: clampTo3k_h: -; FINITE: minsd +; FINITE-NEXT: movsd {{[^,]*}}, %xmm1 +; FINITE-NEXT: minsd %xmm0, %xmm1 +; FINITE-NEXT: movaps %xmm1, %xmm0 +; FINITE-NEXT: ret define double @clampTo3k_h(double %x) nounwind readnone { entry: %0 = fcmp uge double %x, 3.000000e+03 ; <i1> [#uses=1] @@ -923,33 +955,73 @@ entry: } ; UNSAFE-LABEL: test_maxpd: -; UNSAFE: maxpd -define <2 x double> @test_maxpd(<2 x double> %x, <2 x double> %y) { +; UNSAFE-NEXT: maxpd %xmm1, %xmm0 +; UNSAFE-NEXT: ret +define <2 x double> @test_maxpd(<2 x double> %x, <2 x double> %y) nounwind { %max_is_x = fcmp oge <2 x double> %x, %y %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y ret <2 x double> %max } ; UNSAFE-LABEL: test_minpd: -; UNSAFE: minpd -define <2 x double> @test_minpd(<2 x double> %x, <2 x double> %y) { +; UNSAFE-NEXT: minpd %xmm1, %xmm0 +; UNSAFE-NEXT: ret +define <2 x double> @test_minpd(<2 x double> %x, <2 x double> %y) nounwind { %min_is_x = fcmp ole <2 x double> %x, %y %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y ret <2 x double> %min } ; UNSAFE-LABEL: test_maxps: -; UNSAFE: maxps -define <4 x float> @test_maxps(<4 x float> %x, <4 x float> %y) { +; UNSAFE-NEXT: maxps %xmm1, %xmm0 +; UNSAFE-NEXT: ret +define <4 x float> @test_maxps(<4 x float> %x, <4 x float> %y) nounwind { %max_is_x = fcmp oge <4 x float> %x, %y %max = select <4 x 
i1> %max_is_x, <4 x float> %x, <4 x float> %y ret <4 x float> %max } ; UNSAFE-LABEL: test_minps: -; UNSAFE: minps -define <4 x float> @test_minps(<4 x float> %x, <4 x float> %y) { +; UNSAFE-NEXT: minps %xmm1, %xmm0 +; UNSAFE-NEXT: ret +define <4 x float> @test_minps(<4 x float> %x, <4 x float> %y) nounwind { %min_is_x = fcmp ole <4 x float> %x, %y %min = select <4 x i1> %min_is_x, <4 x float> %x, <4 x float> %y ret <4 x float> %min } + +; UNSAFE-LABEL: test_maxps_illegal_v2f32: +; UNSAFE-NEXT: maxps %xmm1, %xmm0 +; UNSAFE-NEXT: ret +define <2 x float> @test_maxps_illegal_v2f32(<2 x float> %x, <2 x float> %y) nounwind { + %max_is_x = fcmp oge <2 x float> %x, %y + %max = select <2 x i1> %max_is_x, <2 x float> %x, <2 x float> %y + ret <2 x float> %max +} + +; UNSAFE-LABEL: test_minps_illegal_v2f32: +; UNSAFE-NEXT: minps %xmm1, %xmm0 +; UNSAFE-NEXT: ret +define <2 x float> @test_minps_illegal_v2f32(<2 x float> %x, <2 x float> %y) nounwind { + %min_is_x = fcmp ole <2 x float> %x, %y + %min = select <2 x i1> %min_is_x, <2 x float> %x, <2 x float> %y + ret <2 x float> %min +} + +; UNSAFE-LABEL: test_maxps_illegal_v3f32: +; UNSAFE-NEXT: maxps %xmm1, %xmm0 +; UNSAFE-NEXT: ret +define <3 x float> @test_maxps_illegal_v3f32(<3 x float> %x, <3 x float> %y) nounwind { + %max_is_x = fcmp oge <3 x float> %x, %y + %max = select <3 x i1> %max_is_x, <3 x float> %x, <3 x float> %y + ret <3 x float> %max +} + +; UNSAFE-LABEL: test_minps_illegal_v3f32: +; UNSAFE-NEXT: minps %xmm1, %xmm0 +; UNSAFE-NEXT: ret +define <3 x float> @test_minps_illegal_v3f32(<3 x float> %x, <3 x float> %y) nounwind { + %min_is_x = fcmp ole <3 x float> %x, %y + %min = select <3 x i1> %min_is_x, <3 x float> %x, <3 x float> %y + ret <3 x float> %min +} diff --git a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll b/test/CodeGen/X86/sse-scalar-fp-arith-2.ll deleted file mode 100644 index 600ee1b7b1e5..000000000000 --- a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll +++ /dev/null @@ -1,423 +0,0 @@ -; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s -; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s -; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s - -; Ensure that the backend selects SSE/AVX scalar fp instructions -; from a packed fp instrution plus a vector insert. 
- - -define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { - %1 = fadd <4 x float> %a, %b - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test_add_ss -; SSE2: addss %xmm1, %xmm0 -; AVX: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { - %1 = fsub <4 x float> %a, %b - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test_sub_ss -; SSE2: subss %xmm1, %xmm0 -; AVX: vsubss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { - %1 = fmul <4 x float> %a, %b - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test_mul_ss -; SSE2: mulss %xmm1, %xmm0 -; AVX: vmulss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { - %1 = fdiv <4 x float> %a, %b - %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test_div_ss -; SSE2: divss %xmm1, %xmm0 -; AVX: vdivss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { - %1 = fadd <2 x double> %a, %b - %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test_add_sd -; SSE2: addsd %xmm1, %xmm0 -; AVX: vaddsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { - %1 = fsub <2 x double> %a, %b - %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test_sub_sd -; SSE2: subsd %xmm1, %xmm0 -; AVX: vsubsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { - %1 = fmul <2 x double> %a, %b - %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test_mul_sd -; SSE2: mulsd %xmm1, %xmm0 -; AVX: vmulsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { - %1 = fdiv <2 x double> %a, %b - %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test_div_sd -; SSE2: divsd %xmm1, %xmm0 -; AVX: vdivsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) { - %1 = fadd <4 x float> %b, %a - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test2_add_ss -; SSE2: addss %xmm0, %xmm1 -; AVX: vaddss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) { - %1 = fsub <4 x float> %b, %a - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test2_sub_ss -; SSE2: subss %xmm0, %xmm1 -; AVX: vsubss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) { - %1 = fmul <4 x float> %b, %a - %2 = shufflevector <4 x 
float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test2_mul_ss -; SSE2: mulss %xmm0, %xmm1 -; AVX: vmulss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) { - %1 = fdiv <4 x float> %b, %a - %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> - ret <4 x float> %2 -} - -; CHECK-LABEL: test2_div_ss -; SSE2: divss %xmm0, %xmm1 -; AVX: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) { - %1 = fadd <2 x double> %b, %a - %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test2_add_sd -; SSE2: addsd %xmm0, %xmm1 -; AVX: vaddsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) { - %1 = fsub <2 x double> %b, %a - %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test2_sub_sd -; SSE2: subsd %xmm0, %xmm1 -; AVX: vsubsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) { - %1 = fmul <2 x double> %b, %a - %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test2_mul_sd -; SSE2: mulsd %xmm0, %xmm1 -; AVX: vmulsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) { - %1 = fdiv <2 x double> %b, %a - %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> - ret <2 x double> %2 -} - -; CHECK-LABEL: test2_div_sd -; SSE2: divsd %xmm0, %xmm1 -; AVX: vdivsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <4 x float> @test3_add_ss(<4 x float> %a, <4 x float> %b) { - %1 = fadd <4 x float> %a, %b - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test3_add_ss -; SSE2: addss %xmm1, %xmm0 -; AVX: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test3_sub_ss(<4 x float> %a, <4 x float> %b) { - %1 = fsub <4 x float> %a, %b - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test3_sub_ss -; SSE2: subss %xmm1, %xmm0 -; AVX: vsubss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test3_mul_ss(<4 x float> %a, <4 x float> %b) { - %1 = fmul <4 x float> %a, %b - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test3_mul_ss -; SSE2: mulss %xmm1, %xmm0 -; AVX: vmulss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test3_div_ss(<4 x float> %a, <4 x float> %b) { - %1 = fdiv <4 x float> %a, %b - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test3_div_ss -; SSE2: divss %xmm1, %xmm0 -; AVX: vdivss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <2 x double> @test3_add_sd(<2 x double> %a, <2 x double> %b) { - %1 = fadd <2 x double> %a, %b - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: 
test3_add_sd -; SSE2: addsd %xmm1, %xmm0 -; AVX: vaddsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test3_sub_sd(<2 x double> %a, <2 x double> %b) { - %1 = fsub <2 x double> %a, %b - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test3_sub_sd -; SSE2: subsd %xmm1, %xmm0 -; AVX: vsubsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test3_mul_sd(<2 x double> %a, <2 x double> %b) { - %1 = fmul <2 x double> %a, %b - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test3_mul_sd -; SSE2: mulsd %xmm1, %xmm0 -; AVX: vmulsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test3_div_sd(<2 x double> %a, <2 x double> %b) { - %1 = fdiv <2 x double> %a, %b - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test3_div_sd -; SSE2: divsd %xmm1, %xmm0 -; AVX: vdivsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <4 x float> @test4_add_ss(<4 x float> %a, <4 x float> %b) { - %1 = fadd <4 x float> %b, %a - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test4_add_ss -; SSE2: addss %xmm0, %xmm1 -; AVX: vaddss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test4_sub_ss(<4 x float> %a, <4 x float> %b) { - %1 = fsub <4 x float> %b, %a - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test4_sub_ss -; SSE2: subss %xmm0, %xmm1 -; AVX: vsubss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test4_mul_ss(<4 x float> %a, <4 x float> %b) { - %1 = fmul <4 x float> %b, %a - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test4_mul_ss -; SSE2: mulss %xmm0, %xmm1 -; AVX: vmulss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <4 x float> @test4_div_ss(<4 x float> %a, <4 x float> %b) { - %1 = fdiv <4 x float> %b, %a - %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 - ret <4 x float> %2 -} - -; CHECK-LABEL: test4_div_ss -; SSE2: divss %xmm0, %xmm1 -; AVX: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - -define <2 x double> @test4_add_sd(<2 x double> %a, <2 x double> %b) { - %1 = fadd <2 x double> %b, %a - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test4_add_sd -; SSE2: addsd %xmm0, %xmm1 -; AVX: vaddsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test4_sub_sd(<2 x double> %a, <2 x double> %b) { - %1 = fsub <2 x double> %b, %a - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test4_sub_sd -; SSE2: subsd %xmm0, %xmm1 -; AVX: vsubsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> @test4_mul_sd(<2 x double> %a, <2 x double> %b) { - %1 = fmul <2 x double> %b, %a - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test4_mul_sd -; SSE2: mulsd %xmm0, %xmm1 -; AVX: vmulsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - -define <2 x double> 
@test4_div_sd(<2 x double> %a, <2 x double> %b) { - %1 = fdiv <2 x double> %b, %a - %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 - ret <2 x double> %2 -} - -; CHECK-LABEL: test4_div_sd -; SSE2: divsd %xmm0, %xmm1 -; AVX: vdivsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll index 3949a835e67a..b122ef67544c 100644 --- a/test/CodeGen/X86/sse-scalar-fp-arith.ll +++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll @@ -1,13 +1,23 @@ -; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s -; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s -; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s +; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s +; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s +; RUN: llc -mcpu=x86-64 -mattr=+avx < %s | FileCheck --check-prefix=AVX %s + +target triple = "x86_64-unknown-unknown" ; Ensure that the backend no longer emits unnecessary vector insert ; instructions immediately after SSE scalar fp instructions ; like addss or mulss. - define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %add = fadd float %2, %1 @@ -15,14 +25,16 @@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_add_ss -; SSE2: addss %xmm1, %xmm0 -; AVX: vaddss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %sub = fsub float %2, %1 @@ -30,13 +42,16 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_sub_ss -; SSE2: subss %xmm1, %xmm0 -; AVX: vsubss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %mul = fmul float %2, %1 @@ -44,14 +59,16 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_mul_ss -; SSE2: mulss %xmm1, %xmm0 -; AVX: vmulss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_div_ss: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %div = fdiv 
float %2, %1 @@ -59,14 +76,16 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_div_ss -; SSE2: divss %xmm1, %xmm0 -; AVX: vdivss %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test_add_sd: +; SSE: # BB#0: +; SSE-NEXT: addsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_add_sd: +; AVX: # BB#0: +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> %a, i32 0 %add = fadd double %2, %1 @@ -74,14 +93,16 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test_add_sd -; SSE2: addsd %xmm1, %xmm0 -; AVX: vaddsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test_sub_sd: +; SSE: # BB#0: +; SSE-NEXT: subsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_sub_sd: +; AVX: # BB#0: +; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> %a, i32 0 %sub = fsub double %2, %1 @@ -89,14 +110,16 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test_sub_sd -; SSE2: subsd %xmm1, %xmm0 -; AVX: vsubsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test_mul_sd: +; SSE: # BB#0: +; SSE-NEXT: mulsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_mul_sd: +; AVX: # BB#0: +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> %a, i32 0 %mul = fmul double %2, %1 @@ -104,14 +127,16 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test_mul_sd -; SSE2: mulsd %xmm1, %xmm0 -; AVX: vmulsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test_div_sd: +; SSE: # BB#0: +; SSE-NEXT: divsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_div_sd: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %b, i32 0 %2 = extractelement <2 x double> %a, i32 0 %div = fdiv double %2, %1 @@ -119,14 +144,17 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test_div_sd -; SSE2: divsd %xmm1, %xmm0 -; AVX: vdivsd %xmm1, %xmm0, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test2_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %add = fadd float %1, %2 @@ -134,14 +162,17 @@ define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test2_add_ss -; SSE2: addss %xmm0, %xmm1 -; AVX: vaddss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test2_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: subss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; 
SSE-NEXT: retq +; +; AVX-LABEL: test2_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %sub = fsub float %2, %1 @@ -149,14 +180,17 @@ define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test2_sub_ss -; SSE2: subss %xmm0, %xmm1 -; AVX: vsubss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test2_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %mul = fmul float %1, %2 @@ -164,14 +198,17 @@ define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test2_mul_ss -; SSE2: mulss %xmm0, %xmm1 -; AVX: vmulss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test2_div_ss: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %a, i32 0 %2 = extractelement <4 x float> %b, i32 0 %div = fdiv float %2, %1 @@ -179,14 +216,17 @@ define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test2_div_ss -; SSE2: divss %xmm0, %xmm1 -; AVX: vdivss %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movss -; CHECK: ret - - define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test2_add_sd: +; SSE: # BB#0: +; SSE-NEXT: addsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_add_sd: +; AVX: # BB#0: +; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %add = fadd double %1, %2 @@ -194,14 +234,17 @@ define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test2_add_sd -; SSE2: addsd %xmm0, %xmm1 -; AVX: vaddsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test2_sub_sd: +; SSE: # BB#0: +; SSE-NEXT: subsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_sub_sd: +; AVX: # BB#0: +; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %sub = fsub double %2, %1 @@ -209,14 +252,17 @@ define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test2_sub_sd -; SSE2: subsd %xmm0, %xmm1 -; AVX: vsubsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test2_mul_sd: +; SSE: # BB#0: +; SSE-NEXT: mulsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_mul_sd: +; AVX: # BB#0: +; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %mul = fmul double %1, %2 @@ -224,14 +270,17 @@ define <2 x double> @test2_mul_sd(<2 
x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test2_mul_sd -; SSE2: mulsd %xmm0, %xmm1 -; AVX: vmulsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: test2_div_sd: +; SSE: # BB#0: +; SSE-NEXT: divsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test2_div_sd: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = extractelement <2 x double> %a, i32 0 %2 = extractelement <2 x double> %b, i32 0 %div = fdiv double %2, %1 @@ -239,14 +288,18 @@ define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) { ret <2 x double> %3 } -; CHECK-LABEL: test2_div_sd -; SSE2: divsd %xmm0, %xmm1 -; AVX: vdivsd %xmm0, %xmm1, %xmm0 -; CHECK-NOT: movsd -; CHECK: ret - - define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_multiple_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm0, %xmm1 +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_multiple_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %add = fadd float %2, %1 @@ -255,14 +308,19 @@ define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_multiple_add_ss -; CHECK: addss -; CHECK: addss -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_multiple_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: subss %xmm1, %xmm2 +; SSE-NEXT: subss %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_multiple_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %sub = fsub float %2, %1 @@ -271,14 +329,18 @@ define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_multiple_sub_ss -; CHECK: subss -; CHECK: subss -; CHECK-NOT: movss -; CHECK: ret - - define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_multiple_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm0, %xmm1 +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_multiple_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %mul = fmul float %2, %1 @@ -287,13 +349,19 @@ define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_multiple_mul_ss -; CHECK: mulss -; CHECK: mulss -; CHECK-NOT: movss -; CHECK: ret - define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: test_multiple_div_ss: +; SSE: # BB#0: +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: divss %xmm1, %xmm2 +; SSE-NEXT: divss %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_multiple_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = extractelement <4 x float> %b, i32 0 %2 = extractelement <4 x float> %a, i32 0 %div = fdiv float %2, %1 @@ -302,9 +370,501 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x 
float> %b) { ret <4 x float> %3 } -; CHECK-LABEL: test_multiple_div_ss -; CHECK: divss -; CHECK: divss -; CHECK-NOT: movss -; CHECK: ret +; Ensure that the backend selects SSE/AVX scalar fp instructions +; from a packed fp instruction plus a vector insert. + +define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fadd <4 x float> %a, %b + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fsub <4 x float> %a, %b + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fmul <4 x float> %a, %b + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test_div_ss: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <4 x float> %a, %b + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test_add_sd: +; SSE: # BB#0: +; SSE-NEXT: addsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_add_sd: +; AVX: # BB#0: +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fadd <2 x double> %a, %b + %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test_sub_sd: +; SSE: # BB#0: +; SSE-NEXT: subsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_sub_sd: +; AVX: # BB#0: +; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fsub <2 x double> %a, %b + %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test_mul_sd: +; SSE: # BB#0: +; SSE-NEXT: mulsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_mul_sd: +; AVX: # BB#0: +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fmul <2 x double> %a, %b + %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test_div_sd: +; SSE: # BB#0: +; SSE-NEXT: divsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test_div_sd: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; 
AVX-NEXT: retq + %1 = fdiv <2 x double> %a, %b + %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test2_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fadd <4 x float> %b, %a + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test2_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: subss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fsub <4 x float> %b, %a + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test2_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fmul <4 x float> %b, %a + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test2_div_ss: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <4 x float> %b, %a + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %2 +} + +define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test2_add_sd: +; SSE: # BB#0: +; SSE-NEXT: addsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_add_sd: +; AVX: # BB#0: +; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fadd <2 x double> %b, %a + %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test2_sub_sd: +; SSE: # BB#0: +; SSE-NEXT: subsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_sub_sd: +; AVX: # BB#0: +; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fsub <2 x double> %b, %a + %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test2_mul_sd: +; SSE: # BB#0: +; SSE-NEXT: mulsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_mul_sd: +; AVX: # BB#0: +; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fmul <2 x double> %b, %a + %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test2_div_sd: +; SSE: # BB#0: +; 
SSE-NEXT: divsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test2_div_sd: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <2 x double> %b, %a + %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %2 +} + +define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test3_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fadd <4 x float> %a, %b + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 + ret <4 x float> %2 +} + +define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test3_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fsub <4 x float> %a, %b + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 + ret <4 x float> %2 +} + +define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test3_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fmul <4 x float> %a, %b + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 + ret <4 x float> %2 +} +define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test3_div_ss: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <4 x float> %a, %b + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1 + ret <4 x float> %2 +} + +define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test3_add_sd: +; SSE: # BB#0: +; SSE-NEXT: addsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_add_sd: +; AVX: # BB#0: +; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fadd <2 x double> %a, %b + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 + ret <2 x double> %2 +} + +define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test3_sub_sd: +; SSE: # BB#0: +; SSE-NEXT: subsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_sub_sd: +; AVX: # BB#0: +; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fsub <2 x double> %a, %b + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 + ret <2 x double> %2 +} + +define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test3_mul_sd: +; SSE: # BB#0: +; SSE-NEXT: mulsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test3_mul_sd: +; AVX: # BB#0: +; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fmul <2 x double> %a, %b + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 + ret <2 x double> %2 +} + +define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test3_div_sd: +; SSE: # BB#0: +; SSE-NEXT: divsd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; 
AVX-LABEL: insert_test3_div_sd: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <2 x double> %a, %b + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1 + ret <2 x double> %2 +} + +define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test4_add_ss: +; SSE: # BB#0: +; SSE-NEXT: addss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_add_ss: +; AVX: # BB#0: +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fadd <4 x float> %b, %a + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 + ret <4 x float> %2 +} + +define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test4_sub_ss: +; SSE: # BB#0: +; SSE-NEXT: subss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_sub_ss: +; AVX: # BB#0: +; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fsub <4 x float> %b, %a + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 + ret <4 x float> %2 +} + +define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test4_mul_ss: +; SSE: # BB#0: +; SSE-NEXT: mulss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_mul_ss: +; AVX: # BB#0: +; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fmul <4 x float> %b, %a + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 + ret <4 x float> %2 +} + +define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: insert_test4_div_ss: +; SSE: # BB#0: +; SSE-NEXT: divss %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_div_ss: +; AVX: # BB#0: +; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <4 x float> %b, %a + %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1 + ret <4 x float> %2 +} + +define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test4_add_sd: +; SSE: # BB#0: +; SSE-NEXT: addsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_add_sd: +; AVX: # BB#0: +; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fadd <2 x double> %b, %a + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 + ret <2 x double> %2 +} + +define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test4_sub_sd: +; SSE: # BB#0: +; SSE-NEXT: subsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_sub_sd: +; AVX: # BB#0: +; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fsub <2 x double> %b, %a + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 + ret <2 x double> %2 +} + +define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test4_mul_sd: +; SSE: # BB#0: +; SSE-NEXT: mulsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_mul_sd: +; AVX: # BB#0: +; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fmul <2 x double> %b, %a + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 + ret <2 x double> %2 +} + +define <2 x double> @insert_test4_div_sd(<2 x 
double> %a, <2 x double> %b) { +; SSE-LABEL: insert_test4_div_sd: +; SSE: # BB#0: +; SSE-NEXT: divsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_test4_div_sd: +; AVX: # BB#0: +; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = fdiv <2 x double> %b, %a + %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1 + ret <2 x double> %2 +} diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll index 183297e4c306..fd35e75d71ae 100644 --- a/test/CodeGen/X86/sse1.ll +++ b/test/CodeGen/X86/sse1.ll @@ -1,17 +1,6 @@ ; Tests for SSE1 and below, without SSE2+. -; RUN: llc < %s -march=x86 -mcpu=pentium3 -O3 | FileCheck %s -; RUN: llc < %s -march=x86-64 -mattr=-sse2,+sse -O3 | FileCheck %s - -define <8 x i16> @test1(<8 x i32> %a) nounwind { -; CHECK: test1 - ret <8 x i16> zeroinitializer -} - -define <8 x i16> @test2(<8 x i32> %a) nounwind { -; CHECK: test2 - %c = trunc <8 x i32> %a to <8 x i16> ; <<8 x i16>> [#uses=1] - ret <8 x i16> %c -} +; RUN: llc < %s -mtriple=i386-unknown-unknown -march=x86 -mcpu=pentium3 -O3 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=-sse2,+sse -O3 | FileCheck %s ; PR7993 ;define <4 x i32> @test3(<4 x i16> %a) nounwind { @@ -23,6 +12,15 @@ define <8 x i16> @test2(<8 x i32> %a) nounwind { ; vector that this ends up returning. ; rdar://8368414 define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind { +; CHECK-LABEL: test4: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] +; CHECK-NEXT: addss %xmm1, %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; CHECK-NEXT: subss %xmm1, %xmm2 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: ret entry: %tmp7 = extractelement <2 x float> %A, i32 0 %tmp5 = extractelement <2 x float> %A, i32 1 @@ -33,15 +31,6 @@ entry: %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0 %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1 ret <2 x float> %tmp9 -; CHECK-LABEL: test4: -; CHECK-NOT: shufps $16 -; CHECK: shufps $1, -; CHECK-NOT: shufps $16 -; CHECK: shufps $1, -; CHECK-NOT: shufps $16 -; CHECK: unpcklps -; CHECK-NOT: shufps $16 -; CHECK: ret } ; We used to get stuck in type legalization for this example when lowering the @@ -50,8 +39,9 @@ entry: ; condition operand and widening the resulting vselect for the v4f32 result. 
; PR18036 -; CHECK-LABEL: vselect define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { +; CHECK-LABEL: vselect: +; CHECK: ret entry: %a1 = icmp eq <4 x i32> %q, zeroinitializer %a14 = select <4 x i1> %a1, <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00> , <4 x float> zeroinitializer diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll deleted file mode 100644 index c63ff72b4801..000000000000 --- a/test/CodeGen/X86/sse2-blend.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s - -; CHECK-LABEL: vsel_float -; CHECK-NOT: xorps -; CHECK: movss -; CHECK-NOT: orps -; CHECK: ret -define void@vsel_float(<4 x float>* %v1, <4 x float>* %v2) { - %A = load <4 x float>* %v1 - %B = load <4 x float>* %v2 - %vsel = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %A, <4 x float> %B - store <4 x float > %vsel, <4 x float>* %v1 - ret void -} - -; CHECK-LABEL: vsel_i32 -; CHECK-NOT: xorps -; CHECK: movss -; CHECK-NOT: orps -; CHECK: ret -define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) { - %A = load <4 x i32>* %v1 - %B = load <4 x i32>* %v2 - %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B - store <4 x i32 > %vsel, <4 x i32>* %v1 - ret void -} - -; Without forcing instructions, fall back to the preferred PS domain. -; CHECK-LABEL: vsel_i64 -; CHECK: andnps -; CHECK: orps -; CHECK: ret - -define void@vsel_i64(<2 x i64>* %v1, <2 x i64>* %v2) { - %A = load <2 x i64>* %v1 - %B = load <2 x i64>* %v2 - %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %A, <2 x i64> %B - store <2 x i64 > %vsel, <2 x i64>* %v1 - ret void -} - -; Without forcing instructions, fall back to the preferred PS domain.
-; CHECK-LABEL: vsel_double -; CHECK: andnps -; CHECK: orps -; CHECK: ret - -define void@vsel_double(<2 x double>* %v1, <2 x double>* %v2) { - %A = load <2 x double>* %v1 - %B = load <2 x double>* %v2 - %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %A, <2 x double> %B - store <2 x double > %vsel, <2 x double>* %v1 - ret void -} - - diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll index c906ecdd60c1..ddb04211ec7b 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -411,7 +411,7 @@ declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) { - ; CHECK: pslldq + ; CHECK: pslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -419,7 +419,7 @@ declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) { - ; CHECK: pslldq + ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8] %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -507,7 +507,7 @@ declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) { - ; CHECK: psrldq + ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } @@ -515,7 +515,7 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) { - ; CHECK: psrldq + ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] ret <2 x i64> %res } diff --git a/test/CodeGen/X86/sse2-mul.ll b/test/CodeGen/X86/sse2-mul.ll deleted file mode 100644 index e066368dc73e..000000000000 --- a/test/CodeGen/X86/sse2-mul.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s - -define <4 x i32> @test1(<4 x i32> %x, <4 x i32> %y) { - %m = mul <4 x i32> %x, %y - ret <4 x i32> %m -; CHECK-LABEL: test1: -; CHECK: pshufd $49 -; CHECK: pmuludq -; CHECK: pshufd $49 -; CHECK: pmuludq -; CHECK: shufps $-120 -; CHECK: pshufd $-40 -; CHECK: ret -} diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll index e8d3d6f19ed7..b7db6cb56ef2 100644 --- a/test/CodeGen/X86/sse2.ll +++ b/test/CodeGen/X86/sse2.ll @@ -2,39 +2,48 @@ ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=pentium4 -O3 | FileCheck %s define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind { +; CHECK-LABEL: test1: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movapd (%ecx), %xmm0 +; CHECK-NEXT: movlpd {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: movapd %xmm0, (%eax) +; CHECK-NEXT: retl %tmp3 = load <2 x double>* %A, align 16 %tmp7 = insertelement <2 x double> undef, double %B, i32 0 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 > store <2 x double> %tmp9, <2 x double>* %r, align 16 ret void - -; CHECK-LABEL: test1: -; CHECK: movl 4(%esp), %eax -; CHECK-NEXT: movl 8(%esp), %ecx -; CHECK-NEXT: 
movapd (%ecx), %xmm0 -; CHECK-NEXT: movlpd 12(%esp), %xmm0 -; CHECK-NEXT: movapd %xmm0, (%eax) -; CHECK-NEXT: ret } define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind { +; CHECK-LABEL: test2: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movapd (%ecx), %xmm0 +; CHECK-NEXT: movhpd {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: movapd %xmm0, (%eax) +; CHECK-NEXT: retl %tmp3 = load <2 x double>* %A, align 16 %tmp7 = insertelement <2 x double> undef, double %B, i32 0 %tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 > store <2 x double> %tmp9, <2 x double>* %r, align 16 ret void - -; CHECK-LABEL: test2: -; CHECK: movl 4(%esp), %eax -; CHECK: movl 8(%esp), %ecx -; CHECK-NEXT: movapd (%ecx), %xmm0 -; CHECK-NEXT: movhpd 12(%esp), %xmm0 -; CHECK-NEXT: movapd %xmm0, (%eax) -; CHECK-NEXT: ret } define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind { +; CHECK-LABEL: test3: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movaps (%edx), %xmm0 +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps %xmm0, (%eax) +; CHECK-NEXT: retl %tmp = load <4 x float>* %B ; <<4 x float>> [#uses=2] %tmp3 = load <4 x float>* %A ; <<4 x float>> [#uses=2] %tmp.upgrd.1 = extractelement <4 x float> %tmp3, i32 0 ; <float> [#uses=1] @@ -47,24 +56,30 @@ define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 ; <<4 x float>> [#uses=1] store <4 x float> %tmp13, <4 x float>* %res ret void -; CHECK: @test3 -; CHECK: unpcklps } define void @test4(<4 x float> %X, <4 x float>* %res) nounwind { +; CHECK-LABEL: test4: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; CHECK-NEXT: movaps %xmm0, (%eax) +; CHECK-NEXT: retl %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] store <4 x float> %tmp5, <4 x float>* %res ret void -; CHECK: @test4 -; CHECK: pshufd $50, %xmm0, %xmm0 } define <4 x i32> @test5(i8** %ptr) nounwind { ; CHECK-LABEL: test5: -; CHECK: pxor -; CHECK: punpcklbw -; CHECK: punpcklwd - +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl (%eax), %eax +; CHECK-NEXT: movss (%eax), %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: retl %tmp = load i8** %ptr ; <i8*> [#uses=1] %tmp.upgrd.1 = bitcast i8* %tmp to float* ; <float*> [#uses=1] %tmp.upgrd.2 = load float* %tmp.upgrd.1 ; <float> [#uses=1] @@ -81,30 +96,39 @@ define <4 x i32> @test5(i8** %ptr) nounwind { } define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind { - %tmp1 = load <4 x float>* %A ; <<4 x float>> [#uses=1] - %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1] - store <4 x float> %tmp2, <4 x float>* %res - ret void - ; CHECK-LABEL: test6: -; CHECK: movaps (%ecx), %xmm0 -; CHECK: movaps %xmm0, (%eax) +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), 
%eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movaps (%ecx), %xmm0 +; CHECK-NEXT: movaps %xmm0, (%eax) +; CHECK-NEXT: retl + %tmp1 = load <4 x float>* %A ; <<4 x float>> [#uses=1] + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1] + store <4 x float> %tmp2, <4 x float>* %res + ret void } define void @test7() nounwind { - bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1] - shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1] - store <4 x float> %2, <4 x float>* null - ret void - ; CHECK-LABEL: test7: -; CHECK: xorps %xmm0, %xmm0 -; CHECK: movaps %xmm0, 0 +; CHECK: ## BB#0: +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, 0 +; CHECK-NEXT: retl + bitcast <4 x i32> zeroinitializer to <4 x float> ; <<4 x float>>:1 [#uses=1] + shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1] + store <4 x float> %2, <4 x float>* null + ret void } @x = external global [4 x i32] define <2 x i64> @test8() nounwind { +; CHECK-LABEL: test8: +; CHECK: ## BB#0: +; CHECK-NEXT: movl L_x$non_lazy_ptr, %eax +; CHECK-NEXT: movups (%eax), %xmm0 +; CHECK-NEXT: retl %tmp = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 0) ; <i32> [#uses=1] %tmp3 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 1) ; <i32> [#uses=1] %tmp5 = load i32* getelementptr ([4 x i32]* @x, i32 0, i32 2) ; <i32> [#uses=1] @@ -115,90 +139,123 @@ define <2 x i64> @test8() nounwind { %tmp15 = insertelement <4 x i32> %tmp14, i32 %tmp7, i32 3 ; <<4 x i32>> [#uses=1] %tmp16 = bitcast <4 x i32> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1] ret <2 x i64> %tmp16 -; CHECK-LABEL: test8: -; CHECK: movups (%eax), %xmm0 } define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind { +; CHECK-LABEL: test9: +; CHECK: ## BB#0: +; CHECK-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: retl %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1] %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1] %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1] %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1] ret <4 x float> %tmp13 -; CHECK-LABEL: test9: -; CHECK: movups 8(%esp), %xmm0 } define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind { +; CHECK-LABEL: test10: +; CHECK: ## BB#0: +; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: retl %tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1] %tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1] %tmp12 = insertelement <4 x float> %tmp11, float %c, i32 2 ; <<4 x float>> [#uses=1] %tmp13 = insertelement <4 x float> %tmp12, float %d, i32 3 ; <<4 x float>> [#uses=1] ret <4 x float> %tmp13 -; CHECK-LABEL: test10: -; CHECK: movaps 4(%esp), %xmm0 } define <2 x double> @test11(double %a, double %b) nounwind { +; CHECK-LABEL: test11: +; CHECK: ## BB#0: +; CHECK-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: retl %tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1] %tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1] ret <2 x double> %tmp7 -; CHECK-LABEL: test11: -; CHECK: movaps 4(%esp), %xmm0 } define void @test12() nounwind { - %tmp1 = load <4 x float>* null ; <<4 x float>> 
[#uses=2] - %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1] - %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1] - %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1] - store <4 x float> %tmp4, <4 x float>* null - ret void ; CHECK-LABEL: test12: -; CHECK: movhlps -; CHECK: shufps +; CHECK: ## BB#0: +; CHECK-NEXT: movapd 0, %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] +; CHECK-NEXT: movsd %xmm0, %xmm1 +; CHECK-NEXT: xorpd %xmm2, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; CHECK-NEXT: addps %xmm1, %xmm0 +; CHECK-NEXT: movaps %xmm0, 0 +; CHECK-NEXT: retl + %tmp1 = load <4 x float>* null ; <<4 x float>> [#uses=2] + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x i32> < i32 0, i32 1, i32 6, i32 7 > ; <<4 x float>> [#uses=1] + %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1] + %tmp4 = fadd <4 x float> %tmp2, %tmp3 ; <<4 x float>> [#uses=1] + store <4 x float> %tmp4, <4 x float>* null + ret void } define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { - %tmp3 = load <4 x float>* %B ; <<4 x float>> [#uses=1] - %tmp5 = load <4 x float>* %C ; <<4 x float>> [#uses=1] - %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] - store <4 x float> %tmp11, <4 x float>* %res - ret void -; CHECK: test13 -; CHECK: shufps $69, (%ecx), %xmm0 -; CHECK: pshufd $-40, %xmm0, %xmm0 +; CHECK-LABEL: test13: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movaps (%edx), %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; CHECK-NEXT: movaps %xmm0, (%eax) +; CHECK-NEXT: retl + %tmp3 = load <4 x float>* %B ; <<4 x float>> [#uses=1] + %tmp5 = load <4 x float>* %C ; <<4 x float>> [#uses=1] + %tmp11 = shufflevector <4 x float> %tmp3, <4 x float> %tmp5, <4 x i32> < i32 1, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] + store <4 x float> %tmp11, <4 x float>* %res + ret void } define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind { - %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=2] - %tmp5 = load <4 x float>* %x ; <<4 x float>> [#uses=2] - %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1] - %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1] - %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp27 ; CHECK-LABEL: test14: -; CHECK: addps [[X1:%xmm[0-9]+]], [[X0:%xmm[0-9]+]] -; CHECK: subps [[X1]], [[X2:%xmm[0-9]+]] -; CHECK: movlhps [[X2]], [[X0]] +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movaps (%ecx), %xmm1 +; CHECK-NEXT: movaps (%eax), %xmm2 +; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: addps %xmm1, %xmm0 +; CHECK-NEXT: subps %xmm1, %xmm2 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: 
retl + %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=2] + %tmp5 = load <4 x float>* %x ; <<4 x float>> [#uses=2] + %tmp9 = fadd <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1] + %tmp21 = fsub <4 x float> %tmp5, %tmp ; <<4 x float>> [#uses=1] + %tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1] + ret <4 x float> %tmp27 } define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind { -entry: - %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=1] - %tmp3 = load <4 x float>* %x ; <<4 x float>> [#uses=1] - %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp4 ; CHECK-LABEL: test15: -; CHECK: movhlps %xmm1, %xmm0 +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movapd (%ecx), %xmm0 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] +; CHECK-NEXT: retl +entry: + %tmp = load <4 x float>* %y ; <<4 x float>> [#uses=1] + %tmp3 = load <4 x float>* %x ; <<4 x float>> [#uses=1] + %tmp4 = shufflevector <4 x float> %tmp3, <4 x float> %tmp, <4 x i32> < i32 2, i32 3, i32 6, i32 7 > ; <<4 x float>> [#uses=1] + ret <4 x float> %tmp4 } ; PR8900 -; CHECK-LABEL: test16: -; CHECK: unpcklpd -; CHECK: ret define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) { +; CHECK-LABEL: test16: +; CHECK: ## BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movapd 96(%eax), %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: retl %i5 = getelementptr inbounds <4 x double>* %srcA, i32 3 %i6 = load <4 x double>* %i5, align 32 %i7 = shufflevector <4 x double> %i6, <4 x double> undef, <2 x i32> <i32 0, i32 2> @@ -207,6 +264,11 @@ define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocap ; PR9009 define fastcc void @test17() nounwind { +; CHECK-LABEL: test17: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: movaps {{.*#+}} xmm0 = <u,u,32768,32768> +; CHECK-NEXT: movaps %xmm0, (%eax) +; CHECK-NEXT: retl entry: %0 = insertelement <4 x i32> undef, i32 undef, i32 1 %1 = shufflevector <4 x i32> <i32 undef, i32 undef, i32 32768, i32 32768>, <4 x i32> %0, <4 x i32> <i32 4, i32 5, i32 2, i32 3> @@ -217,25 +279,48 @@ entry: ; PR9210 define <4 x float> @f(<4 x double>) nounwind { +; CHECK-LABEL: f: +; CHECK: ## BB#0: ## %entry +; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1 +; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: retl entry: %double2float.i = fptrunc <4 x double> %0 to <4 x float> ret <4 x float> %double2float.i } define <2 x i64> @test_insert_64_zext(<2 x i64> %i) { -; CHECK-LABEL: test_insert_64_zext -; CHECK-NOT: xor -; CHECK: movq +; CHECK-LABEL: test_insert_64_zext: +; CHECK: ## BB#0: +; CHECK-NEXT: movq %xmm0, %xmm0 +; CHECK-NEXT: retl %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2> ret <2 x i64> %1 } define <4 x i32> @PR19721(<4 x i32> %i) { +; CHECK-LABEL: PR19721: +; CHECK: ## BB#0: +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movss %xmm1, %xmm0 +; CHECK-NEXT: retl %bc = bitcast <4 x i32> %i to i128 %insert = and i128 %bc, -4294967296 %bc2 = bitcast i128 %insert to <4 x i32> ret <4 x i32> %bc2 +} -; CHECK-LABEL: PR19721 -; CHECK: punpckldq +define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: test_mul: +; CHECK: ## BB#0: +; CHECK-NEXT: pshufd {{.*#+}} 
xmm2 = xmm0[1,1,3,3] +; CHECK-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; CHECK-NEXT: retl + %m = mul <4 x i32> %x, %y + ret <4 x i32> %m } diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll index b7706cc34bb6..5b2de28c0f5d 100644 --- a/test/CodeGen/X86/sse3-avx-addsub-2.ll +++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE +; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX diff --git a/test/CodeGen/X86/sse3-avx-addsub.ll b/test/CodeGen/X86/sse3-avx-addsub.ll index 8b6674312b34..431588f90ab2 100644 --- a/test/CodeGen/X86/sse3-avx-addsub.ll +++ b/test/CodeGen/X86/sse3-avx-addsub.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s -check-prefix=SSE -check-prefix=CHECK +; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s -check-prefix=SSE -check-prefix=CHECK ; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX -check-prefix=CHECK ; Test ADDSUB ISel patterns. @@ -141,156 +141,3 @@ define <2 x double> @test4b(<2 x double> %A, <2 x double>* %B) { ; AVX: vaddsubpd ; CHECK-NEXT: ret -; Functions below are obtained from the following source: -; -; float4 test1(float4 A, float4 B) { -; float4 X = A + B; -; float4 Y = A - B; -; return (float4){X[0], Y[1], X[2], Y[3]}; -; } -; -; float8 test2(float8 A, float8 B) { -; float8 X = A + B; -; float8 Y = A - B; -; return (float8){X[0], Y[1], X[2], Y[3], X[4], Y[5], X[6], Y[7]}; -; } -; -; double4 test3(double4 A, double4 B) { -; double4 X = A + B; -; double4 Y = A - B; -; return (double4){X[0], Y[1], X[2], Y[3]}; -; } -; -; double2 test4(double2 A, double2 B) { -; double2 X = A + B; -; double2 Y = A - B; -; return (double2){X[0], Y[1]}; -; } - -define <4 x float> @test5(<4 x float> %A, <4 x float> %B) { - %sub = fsub <4 x float> %A, %B - %add = fadd <4 x float> %A, %B - %vecinit6 = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x float> %vecinit6 -} -; CHECK-LABEL: test5 -; SSE: xorps -; SSE-NEXT: addsubps -; AVX: vxorps -; AVX-NEXT: vaddsubps -; CHECK: ret - - -define <8 x float> @test6(<8 x float> %A, <8 x float> %B) { - %sub = fsub <8 x float> %A, %B - %add = fadd <8 x float> %A, %B - %vecinit14 = shufflevector <8 x float> %add, <8 x float> %sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> - ret <8 x float> %vecinit14 -} -; CHECK-LABEL: test6 -; SSE: xorps -; SSE-NEXT: addsubps -; SSE: xorps -; SSE-NEXT: addsubps -; AVX: vxorps -; AVX-NEXT: vaddsubps -; AVX-NOT: vxorps -; AVX-NOT: vaddsubps -; CHECK: ret - - -define <4 x double> @test7(<4 x double> %A, <4 x double> %B) { - %sub = fsub <4 x double> %A, %B - %add = fadd <4 x double> %A, %B - %vecinit6 = shufflevector <4 x double> %add, <4 x double> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x double> %vecinit6 -} -; CHECK-LABEL: test7 -; SSE: xorpd -; SSE-NEXT: addsubpd -; SSE: xorpd -; SSE-NEXT: addsubpd -; AVX: vxorpd -; AVX-NEXT: vaddsubpd -; AVX-NOT: vxorpd -; AVX-NOT: vaddsubpd -; CHECK: ret 
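; [Editor's sketch, not part of the original commit] The deleted test5-test8
; variants above and below exercise ADDSUB instruction selection. For
; reference, the canonical pattern that maps directly onto a single
; (v)addsubps takes the even lanes from an fsub and the odd lanes from an
; fadd of the same two operands. The function name below is hypothetical and
; the IR uses the same 3.6-era syntax as the rest of this diff:
define <4 x float> @addsub_sketch(<4 x float> %A, <4 x float> %B) {
  %sub = fsub <4 x float> %A, %B
  %add = fadd <4 x float> %A, %B
  ; even lanes from %sub, odd lanes from %add -> a single addsubps
  %r = shufflevector <4 x float> %sub, <4 x float> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %r
}
; The deleted tests instead put the fadd lanes in the even positions, which
; is equivalent to addsub(A, -B); that is why their CHECK lines expect an
; xorps (operand negation) immediately before the addsubps.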
- - -define <2 x double> @test8(<2 x double> %A, <2 x double> %B) #0 { - %add = fadd <2 x double> %A, %B - %sub = fsub <2 x double> %A, %B - %vecinit2 = shufflevector <2 x double> %add, <2 x double> %sub, <2 x i32> <i32 0, i32 3> - ret <2 x double> %vecinit2 -} -; CHECK-LABEL: test8 -; SSE: xorpd -; SSE-NEXT: addsubpd -; AVX: vxorpd -; AVX-NEXT: vaddsubpd -; CHECK: ret - - -define <4 x float> @test5b(<4 x float> %A, <4 x float> %B) { - %sub = fsub <4 x float> %A, %B - %add = fadd <4 x float> %B, %A - %vecinit6 = shufflevector <4 x float> %add, <4 x float> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x float> %vecinit6 -} -; CHECK-LABEL: test5 -; SSE: xorps -; SSE-NEXT: addsubps -; AVX: vxorps -; AVX-NEXT: vaddsubps -; CHECK: ret - - -define <8 x float> @test6b(<8 x float> %A, <8 x float> %B) { - %sub = fsub <8 x float> %A, %B - %add = fadd <8 x float> %B, %A - %vecinit14 = shufflevector <8 x float> %add, <8 x float> %sub, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> - ret <8 x float> %vecinit14 -} -; CHECK-LABEL: test6 -; SSE: xorps -; SSE-NEXT: addsubps -; SSE: xorps -; SSE-NEXT: addsubps -; AVX: vxorps -; AVX-NEXT: vaddsubps -; AVX-NOT: vxorps -; AVX-NOT: vaddsubps -; CHECK: ret - - -define <4 x double> @test7b(<4 x double> %A, <4 x double> %B) { - %sub = fsub <4 x double> %A, %B - %add = fadd <4 x double> %B, %A - %vecinit6 = shufflevector <4 x double> %add, <4 x double> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7> - ret <4 x double> %vecinit6 -} -; CHECK-LABEL: test7 -; SSE: xorpd -; SSE-NEXT: addsubpd -; SSE: xorpd -; SSE-NEXT: addsubpd -; AVX: vxorpd -; AVX-NEXT: vaddsubpd -; AVX-NOT: vxorpd -; AVX-NOT: vaddsubpd -; CHECK: ret - - -define <2 x double> @test8b(<2 x double> %A, <2 x double> %B) #0 { - %add = fadd <2 x double> %B, %A - %sub = fsub <2 x double> %A, %B - %vecinit2 = shufflevector <2 x double> %add, <2 x double> %sub, <2 x i32> <i32 0, i32 3> - ret <2 x double> %vecinit2 -} -; CHECK-LABEL: test8 -; SSE: xorpd -; SSE-NEXT: addsubpd -; AVX: vxorpd -; AVX-NEXT: vaddsubpd -; CHECK: ret - diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll index 18bdcb3912b1..0a5b0cab851c 100644 --- a/test/CodeGen/X86/sse3.ll +++ b/test/CodeGen/X86/sse3.ll @@ -1,99 +1,120 @@ ; These are tests for SSE3 codegen. -; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 \ -; RUN: | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -march=x86-64 -mcpu=nocona -mtriple=i686-apple-darwin9 -O3 | FileCheck %s --check-prefix=X64 ; Test for v8xi16 lowering where we extract the first element of the vector and ; placed it in the second element of the result. 
define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind { +; X64-LABEL: t0: +; X64: ## BB#0: ## %entry +; X64-NEXT: movl $1, %eax +; X64-NEXT: movd %eax, %xmm0 +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-NEXT: movdqa %xmm0, (%rdi) +; X64-NEXT: retq entry: %tmp3 = load <8 x i16>* %old %tmp6 = shufflevector <8 x i16> %tmp3, - <8 x i16> < i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >, + <8 x i16> < i16 1, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef >, <8 x i32> < i32 8, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef > store <8 x i16> %tmp6, <8 x i16>* %dest ret void - -; X64-LABEL: t0: -; X64: movdqa (%rsi), %xmm0 -; X64: pslldq $2, %xmm0 -; X64: movdqa %xmm0, (%rdi) -; X64: ret } define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind { +; X64-LABEL: t1: +; X64: ## BB#0: +; X64-NEXT: movdqa (%rdi), %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-NEXT: retq %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > ret <8 x i16> %tmp3 -; X64-LABEL: t1: -; X64: movdqa (%rdi), %xmm0 -; X64: pinsrw $0, (%rsi), %xmm0 -; X64: ret } define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind { +; X64-LABEL: t2: +; X64: ## BB#0: +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,0,3,4,5,6,7] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 > ret <8 x i16> %tmp -; X64-LABEL: t2: -; X64: pextrw $1, %xmm1, %eax -; X64: pinsrw $0, %eax, %xmm0 -; X64: pinsrw $3, %eax, %xmm0 -; X64: ret } define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind { +; X64-LABEL: t3: +; X64: ## BB#0: +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 > ret <8 x i16> %tmp -; X64-LABEL: t3: -; X64: pextrw $5, %xmm0, %eax -; X64: pshuflw $44, %xmm0, %xmm0 -; X64: pshufhw $27, %xmm0, %xmm0 -; X64: pinsrw $3, %eax, %xmm0 -; X64: ret } define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind { +; X64-LABEL: t4: +; X64: ## BB#0: +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,4,7] +; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 > 
ret <8 x i16> %tmp -; X64-LABEL: t4: -; X64: pextrw $7, [[XMM0:%xmm[0-9]+]], %eax -; X64: pshufhw $100, [[XMM0]], [[XMM1:%xmm[0-9]+]] -; X64: pinsrw $1, %eax, [[XMM1]] -; X64: pextrw $1, [[XMM0]], %eax -; X64: pinsrw $4, %eax, %xmm{{[0-9]}} -; X64: ret } define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind { +; X64-LABEL: t5: +; X64: ## BB#0: +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 > ret <8 x i16> %tmp -; X64: t5: -; X64: movlhps %xmm1, %xmm0 -; X64: pshufd $114, %xmm0, %xmm0 -; X64: ret } define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind { +; X64-LABEL: t6: +; X64: ## BB#0: +; X64-NEXT: movss %xmm1, %xmm0 +; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > ret <8 x i16> %tmp -; X64: t6: -; X64: movss %xmm1, %xmm0 -; X64: ret } define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind { +; X64-LABEL: t7: +; X64: ## BB#0: +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] +; X64-NEXT: retq %tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 > ret <8 x i16> %tmp -; X64: t7: -; X64: pshuflw $-80, %xmm0, %xmm0 -; X64: pshufhw $-56, %xmm0, %xmm0 -; X64: ret } define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind { +; X64-LABEL: t8: +; X64: ## BB#0: +; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[2,1,0,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] +; X64-NEXT: movdqa %xmm0, (%rdi) +; X64-NEXT: retq %tmp = load <2 x i64>* %A %tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16> %tmp0 = extractelement <8 x i16> %tmp.upgrd.1, i32 0 @@ -115,14 +136,15 @@ define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind { %tmp15.upgrd.2 = bitcast <8 x i16> %tmp15 to <2 x i64> store <2 x i64> %tmp15.upgrd.2, <2 x i64>* %res ret void -; X64: t8: -; X64: pshuflw $-58, (%rsi), %xmm0 -; X64: pshufhw $-58, %xmm0, %xmm0 -; X64: movdqa %xmm0, (%rdi) -; X64: ret } define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind { +; X64-LABEL: t9: +; X64: ## BB#0: +; X64-NEXT: movapd (%rdi), %xmm0 +; X64-NEXT: movhpd (%rsi), %xmm0 +; X64-NEXT: movapd %xmm0, (%rdi) +; X64-NEXT: retq %tmp = load <4 x float>* %r %tmp.upgrd.3 = bitcast <2 x i32>* %A to double* %tmp.upgrd.4 = load double* %tmp.upgrd.3 @@ -139,11 +161,6 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind { %tmp13 = insertelement <4 x float> %tmp12, float %tmp9, i32 3 store <4 x float> %tmp13, <4 x float>* %r ret void -; X64: t9: -; X64: movaps (%rdi), %xmm0 -; X64: movhps (%rsi), %xmm0 -; X64: movaps %xmm0, (%rdi) -; X64: ret } @@ -154,113 +171,121 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind { @g1 = external constant <4 x i32> @g2 = external constant <4 x i16> -define internal void @t10() nounwind { - load <4 x i32>* @g1, align 16 - bitcast <4 x i32> %1 to <8 x i16> - shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef > - bitcast <8 x i16> %3 to <2 x i64> - extractelement <2 x i64> %4, i32 0 - bitcast i64 %5 to <4 x i16> - store <4 x i16> %6, <4 x i16>* @g2, align 8 - ret void -; X64: t10: -; X64: pextrw $4, [[X0:%xmm[0-9]+]], %e{{..}} -; X64: pextrw $6, [[X0]], %e{{..}} -; X64: movlhps [[X0]], [[X0]] -; X64: pshuflw $8, [[X0]], 
[[X0]] -; X64: pinsrw $2, %e{{..}}, [[X0]] -; X64: pinsrw $3, %e{{..}}, [[X0]] +define void @t10() nounwind { +; X64-LABEL: t10: +; X64: ## BB#0: +; X64-NEXT: movq _g1@{{.*}}(%rip), %rax +; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: movq _g2@{{.*}}(%rip), %rax +; X64-NEXT: movq %xmm0, (%rax) +; X64-NEXT: retq + load <4 x i32>* @g1, align 16 + bitcast <4 x i32> %1 to <8 x i16> + shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> < i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef > + bitcast <8 x i16> %3 to <2 x i64> + extractelement <2 x i64> %4, i32 0 + bitcast i64 %5 to <4 x i16> + store <4 x i16> %6, <4 x i16>* @g2, align 8 + ret void } - ; Pack various elements via shuffles. define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X64-LABEL: t11: +; X64: ## BB#0: ## %entry +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; X64-NEXT: retq entry: %tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > ret <8 x i16> %tmp7 -; X64-LABEL: t11: -; X64: movd %xmm1, %eax -; X64: movlhps %xmm0, %xmm0 -; X64: pshuflw $1, %xmm0, %xmm0 -; X64: pinsrw $1, %eax, %xmm0 -; X64: ret } - define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X64-LABEL: t12: +; X64: ## BB#0: ## %entry +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7] +; X64-NEXT: retq entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef > ret <8 x i16> %tmp9 -; X64-LABEL: t12: -; X64: pextrw $3, %xmm1, %eax -; X64: movlhps %xmm0, %xmm0 -; X64: pshufhw $3, %xmm0, %xmm0 -; X64: pinsrw $5, %eax, %xmm0 -; X64: ret } - define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X64-LABEL: t13: +; X64: ## BB#0: ## %entry +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,6,7] +; X64-NEXT: retq entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 11, i32 3, i32 undef , i32 undef > ret <8 x i16> %tmp9 -; X64-LABEL: t13: -; X64: punpcklqdq %xmm0, %xmm1 -; X64: pextrw $3, %xmm1, %eax -; X64: pshufhw $12, %xmm1, %xmm0 -; X64: pinsrw $4, %eax, %xmm0 -; X64: ret } - define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X64-LABEL: t14: +; X64: ## BB#0: ## %entry +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; X64-NEXT: retq entry: %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef , i32 undef > ret <8 x i16> %tmp9 -; X64-LABEL: t14: -; X64: punpcklqdq %xmm0, %xmm1 -; X64: pshufhw $8, %xmm1, %xmm0 -; X64: ret } - ; FIXME: t15 is worse off from disabling of scheduler 2-address hack. 
define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { +; X64-LABEL: t15: +; X64: ## BB#0: ## %entry +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7] +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; X64-NEXT: retq entry: - %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef > - ret <8 x i16> %tmp8 -; X64: t15: -; X64: pextrw $7, %xmm0, %eax -; X64: punpcklqdq %xmm1, %xmm0 -; X64: pshuflw $-128, %xmm0, %xmm0 -; X64: pinsrw $2, %eax, %xmm0 -; X64: ret + %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef > + ret <8 x i16> %tmp8 } - ; Test yonah where we convert a shuffle to pextrw and pinrsw define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone { +; X64-LABEL: t16: +; X64: ## BB#0: ## %entry +; X64-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0] +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: packuswb %xmm0, %xmm0 +; X64-NEXT: retq entry: - %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > - %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 2, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > - ret <16 x i8> %tmp9 -; X64: t16: -; X64: pextrw $8, %xmm0, %eax -; X64: pslldq $2, %xmm0 -; X64: pextrw $1, %xmm0, %ecx -; X64: movzbl %cl, %ecx -; X64: orl %eax, %ecx -; X64: pinsrw $1, %ecx, %xmm0 -; X64: ret + %tmp8 = shufflevector <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > + %tmp9 = shufflevector <16 x i8> %tmp8, <16 x i8> %T0, <16 x i32> < i32 0, i32 1, i32 2, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > + ret <16 x i8> %tmp9 } ; rdar://8520311 define <4 x i32> @t17() nounwind { -entry: ; X64-LABEL: t17: -; X64: movddup (%rax), 
%xmm0 +; X64: ## BB#0: ## %entry +; X64-NEXT: movddup (%rax), %xmm0 +; X64-NEXT: andpd {{.*}}(%rip), %xmm0 +; X64-NEXT: retq +entry: %tmp1 = load <4 x float>* undef, align 16 %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 4, i32 1, i32 2, i32 3> %tmp3 = load <4 x float>* undef, align 16 diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll deleted file mode 100644 index 3a4812119f8a..000000000000 --- a/test/CodeGen/X86/sse41-blend.ll +++ /dev/null @@ -1,140 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s - -;CHECK-LABEL: vsel_float: -;CHECK: blendps -;CHECK: ret -define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x float> %v1, <4 x float> %v2 - ret <4 x float> %vsel -} - - -;CHECK-LABEL: vsel_4xi8: -;CHECK: blendps -;CHECK: ret -define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { - %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2 - ret <4 x i8> %vsel -} - -;CHECK-LABEL: vsel_4xi16: -;CHECK: blendps -;CHECK: ret -define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2 - ret <4 x i16> %vsel -} - - -;CHECK-LABEL: vsel_i32: -;CHECK: blendps -;CHECK: ret -define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) { - %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %v1, <4 x i32> %v2 - ret <4 x i32> %vsel -} - - -;CHECK-LABEL: vsel_double: -;CHECK: movsd -;CHECK: ret -define <4 x double> @vsel_double(<4 x double> %v1, <4 x double> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> %v1, <4 x double> %v2 - ret <4 x double> %vsel -} - - -;CHECK-LABEL: vsel_i64: -;CHECK: movsd -;CHECK: ret -define <4 x i64> @vsel_i64(<4 x i64> %v1, <4 x i64> %v2) { - %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> %v1, <4 x i64> %v2 - ret <4 x i64> %vsel -} - - -;CHECK-LABEL: vsel_i8: -;CHECK: pblendvb -;CHECK: ret -define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) { - %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2 - ret <16 x i8> %vsel -} - -;; TEST blend + compares -; CHECK: A -define <2 x double> @A(<2 x double> %x, <2 x double> %y) { - ; CHECK: cmplepd - ; CHECK: blendvpd - %max_is_x = fcmp oge <2 x double> %x, %y - %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y - ret <2 x double> %max -} - -; CHECK: B -define <2 x double> @B(<2 x double> %x, <2 x double> %y) { - ; CHECK: cmpnlepd - ; CHECK: blendvpd - %min_is_x = fcmp ult <2 x double> %x, %y - %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y - ret <2 x double> %min -} - -; CHECK: float_crash -define void @float_crash() nounwind { -entry: - %merge205vector_func.i = select <4 x i1> undef, <4 x double> undef, <4 x double> undef - %extract214vector_func.i = extractelement <4 x double> %merge205vector_func.i, i32 0 - store double %extract214vector_func.i, double addrspace(1)* undef, align 8 - ret void -} - -; If we can figure out a blend has a constant mask, we should emit the -; blend instruction with an immediate mask -define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) { -; In this case, we emit a simple movss -; CHECK-LABEL: 
constant_blendvpd -; CHECK: movsd -; CHECK: ret - %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %xy, <2 x double> %ab - ret <2 x double> %1 -} - -define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) { -; CHECK-LABEL: constant_blendvps -; CHECK-NOT: mov -; CHECK: blendps $7 -; CHECK: ret - %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %xyzw, <4 x float> %abcd - ret <4 x float> %1 -} - -define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) { -; CHECK-LABEL: constant_pblendvb: -; CHECK: movaps -; CHECK: pblendvb -; CHECK: ret - %1 = select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x i8> %xyzw, <16 x i8> %abcd - ret <16 x i8> %1 -} - -declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) -declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) -declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) - -;; 2 tests for shufflevectors that optimize to blend + immediate -; CHECK-LABEL: @blend_shufflevector_4xfloat -; CHECK: blendps $6, %xmm1, %xmm0 -; CHECK: ret -define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) { - %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 3> - ret <4 x float> %1 -} - -; CHECK-LABEL: @blend_shufflevector_8xi16 -; CHECK: pblendw $134, %xmm1, %xmm0 -; CHECK: ret -define <8 x i16> @blend_shufflevector_8xi16(<8 x i16> %a, <8 x i16> %b) { - %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 15> - ret <8 x i16> %1 -} diff --git a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll new file mode 100644 index 000000000000..6fab98e70a89 --- /dev/null +++ b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll @@ -0,0 +1,61 @@ +; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.1 | FileCheck %s +; This test works just like the non-upgrade one except that it only checks +; forms which require auto-upgrading. 
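; [Editor's sketch, not part of the original commit] As the companion
; sse41-intrinsics-x86.ll hunks further below show, the immediate operand of
; the blendpd/blendps/dppd/dpps/insertps/mpsadbw/pblendw intrinsics was
; narrowed from i32 to i8; this new file deliberately keeps the old i32 forms
; so that the IR auto-upgrader is exercised. A minimal example of the
; post-upgrade shape (hypothetical function name, same era syntax):
define <8 x i16> @pblendw_upgraded_sketch(<8 x i16> %a, <8 x i16> %b) {
  ; after auto-upgrade the immediate operand is i8 rather than i32
  %r = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a, <8 x i16> %b, i8 7)
  ret <8 x i16> %r
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone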
+ +define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) { + ; CHECK: blendpd + %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone + + +define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) { + ; CHECK: blendps + %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone + + +define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) { + ; CHECK: dppd + %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone + + +define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) { + ; CHECK: dpps + %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone + + +define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) { + ; CHECK: insertps + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone + + + +define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) { + ; CHECK: mpsadbw + %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone + + +define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) { + ; CHECK: pblendw + %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone + + diff --git a/test/CodeGen/X86/sse41-intrinsics-x86.ll b/test/CodeGen/X86/sse41-intrinsics-x86.ll index 37eff43b28c4..5f25a16380de 100644 --- a/test/CodeGen/X86/sse41-intrinsics-x86.ll +++ b/test/CodeGen/X86/sse41-intrinsics-x86.ll @@ -2,18 +2,18 @@ define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) { ; CHECK: blendpd - %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1] + %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1] ret <2 x double> %res } -declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone +declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) { ; CHECK: blendps - %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } -declare <4 x float> @llvm.x86.sse41.blendps(<4 x 
float>, <4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { @@ -34,35 +34,35 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) { ; CHECK: dppd - %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1] + %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1] ret <2 x double> %res } -declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone +declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) { ; CHECK: dpps - %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } -declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) { ; CHECK: insertps - %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1] + %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } -declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK: mpsadbw - %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1] + %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } -declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone +declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) { @@ -83,10 +83,10 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK: pblendw - %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1] + %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1] ret <8 x i16> %res } -declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone +declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) { diff --git a/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll b/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll new file mode 100644 index 000000000000..55faf4d32b36 --- /dev/null +++ b/test/CodeGen/X86/sse41-pmovxrm-intrinsics.ll @@ -0,0 +1,123 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK 
--check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX + +define <8 x i16> @test_llvm_x86_sse41_pmovsxbw(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovsxbw +; SSE41: pmovsxbw (%rdi), %xmm0 +; AVX: vpmovsxbw (%rdi), %xmm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %1) + ret <8 x i16> %2 +} + +define <4 x i32> @test_llvm_x86_sse41_pmovsxbd(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovsxbd +; SSE41: pmovsxbd (%rdi), %xmm0 +; AVX: vpmovsxbd (%rdi), %xmm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %1) + ret <4 x i32> %2 +} + +define <2 x i64> @test_llvm_x86_sse41_pmovsxbq(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovsxbq +; SSE41: pmovsxbq (%rdi), %xmm0 +; AVX: vpmovsxbq (%rdi), %xmm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %1) + ret <2 x i64> %2 +} + +define <4 x i32> @test_llvm_x86_sse41_pmovsxwd(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovsxwd +; SSE41: pmovsxwd (%rdi), %xmm0 +; AVX: vpmovsxwd (%rdi), %xmm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) + ret <4 x i32> %2 +} + +define <2 x i64> @test_llvm_x86_sse41_pmovsxwq(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovsxwq +; SSE41: pmovsxwq (%rdi), %xmm0 +; AVX: vpmovsxwq (%rdi), %xmm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %1) + ret <2 x i64> %2 +} + +define <2 x i64> @test_llvm_x86_sse41_pmovsxdq(<4 x i32>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovsxdq +; SSE41: pmovsxdq (%rdi), %xmm0 +; AVX: vpmovsxdq (%rdi), %xmm0 + %1 = load <4 x i32>* %a, align 1 + %2 = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %1) + ret <2 x i64> %2 +} + +define <8 x i16> @test_llvm_x86_sse41_pmovzxbw(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovzxbw +; SSE41: pmovzxbw (%rdi), %xmm0 +; AVX: vpmovzxbw (%rdi), %xmm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %1) + ret <8 x i16> %2 +} + +define <4 x i32> @test_llvm_x86_sse41_pmovzxbd(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovzxbd +; SSE41: pmovzxbd (%rdi), %xmm0 +; AVX: vpmovzxbd (%rdi), %xmm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %1) + ret <4 x i32> %2 +} + +define <2 x i64> @test_llvm_x86_sse41_pmovzxbq(<16 x i8>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovzxbq +; SSE41: pmovzxbq (%rdi), %xmm0 +; AVX: vpmovzxbq (%rdi), %xmm0 + %1 = load <16 x i8>* %a, align 1 + %2 = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %1) + ret <2 x i64> %2 +} + +define <4 x i32> @test_llvm_x86_sse41_pmovzxwd(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovzxwd +; SSE41: pmovzxwd (%rdi), %xmm0 +; AVX: vpmovzxwd (%rdi), %xmm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %1) + ret <4 x i32> %2 +} + +define <2 x i64> @test_llvm_x86_sse41_pmovzxwq(<8 x i16>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovzxwq +; SSE41: pmovzxwq (%rdi), %xmm0 +; AVX: vpmovzxwq (%rdi), %xmm0 + %1 = load <8 x i16>* %a, align 1 + %2 = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %1) + ret <2 x i64> %2 +} + +define <2 x i64> @test_llvm_x86_sse41_pmovzxdq(<4 x i32>* %a) { +; CHECK-LABEL: test_llvm_x86_sse41_pmovzxdq +; SSE41: pmovzxdq (%rdi), %xmm0 +; AVX: 
vpmovzxdq (%rdi), %xmm0 + %1 = load <4 x i32>* %a, align 1 + %2 = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %1) + ret <2 x i64> %2 +} + +declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) +declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) +declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) +declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) +declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) +declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) +declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) +declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) +declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) +declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) +declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) +declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll index 986488f531ec..9c0c2221cb7f 100644 --- a/test/CodeGen/X86/sse41.ll +++ b/test/CodeGen/X86/sse41.ll @@ -1,30 +1,47 @@ -; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK -; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK +; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64 @g16 = external global i16 define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind { - %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1 - ret <4 x i32> %tmp1 ; X32-LABEL: pinsrd_1: -; X32: pinsrd $1, 4(%esp), %xmm0 - +; X32: ## BB#0: +; X32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0 +; X32-NEXT: retl +; ; X64-LABEL: pinsrd_1: -; X64: pinsrd $1, %edi, %xmm0 +; X64: ## BB#0: +; X64-NEXT: pinsrd $1, %edi, %xmm0 +; X64-NEXT: retq + %tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1 + ret <4 x i32> %tmp1 } define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind { - %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1 - ret <16 x i8> %tmp1 ; X32-LABEL: pinsrb_1: -; X32: pinsrb $1, 4(%esp), %xmm0 - +; X32: ## BB#0: +; X32-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0 +; X32-NEXT: retl +; ; X64-LABEL: pinsrb_1: -; X64: pinsrb $1, %edi, %xmm0 +; X64: ## BB#0: +; X64-NEXT: pinsrb $1, %edi, %xmm0 +; X64-NEXT: retq + %tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1 + ret <16 x i8> %tmp1 } - define <2 x i64> @pmovsxbd_1(i32* %p) nounwind { +; X32-LABEL: pmovsxbd_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: pmovsxbd (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: pmovsxbd_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: pmovsxbd (%rdi), %xmm0 +; X64-NEXT: retq entry: %0 = load i32* %p, align 4 %1 = insertelement <4 x i32> undef, i32 %0, i32 0 @@ -35,16 +52,19 @@ entry: %6 = tail call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %5) nounwind readnone %7 = bitcast <4 x i32> %6 to <2 x i64> ret <2 x i64> %7 - -; X32: _pmovsxbd_1: -; X32: movl 4(%esp), %eax -; X32: pmovsxbd (%eax), %xmm0 - -; X64: _pmovsxbd_1: -; X64: pmovsxbd (%rdi), %xmm0 } define <2 x i64> @pmovsxwd_1(i64* %p) nounwind readonly { +; X32-LABEL: pmovsxwd_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: pmovsxwd (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: pmovsxwd_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: pmovsxwd (%rdi), %xmm0 +; X64-NEXT: retq entry: %0 = load i64* %p ; <i64> [#uses=1] %tmp2 = insertelement <2 x i64> 
zeroinitializer, i64 %0, i32 0 ; <<2 x i64>> [#uses=1] @@ -52,63 +72,59 @@ entry: %2 = tail call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %1) nounwind readnone ; <<4 x i32>> [#uses=1] %3 = bitcast <4 x i32> %2 to <2 x i64> ; <<2 x i64>> [#uses=1] ret <2 x i64> %3 - -; X32: _pmovsxwd_1: -; X32: movl 4(%esp), %eax -; X32: pmovsxwd (%eax), %xmm0 - -; X64: _pmovsxwd_1: -; X64: pmovsxwd (%rdi), %xmm0 } - - - define <2 x i64> @pmovzxbq_1() nounwind { +; X32-LABEL: pmovzxbq_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: movl L_g16$non_lazy_ptr, %eax +; X32-NEXT: pmovzxbq (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: pmovzxbq_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: movq _g16@{{.*}}(%rip), %rax +; X64-NEXT: pmovzxbq (%rax), %xmm0 +; X64-NEXT: retq entry: %0 = load i16* @g16, align 2 ; <i16> [#uses=1] %1 = insertelement <8 x i16> undef, i16 %0, i32 0 ; <<8 x i16>> [#uses=1] %2 = bitcast <8 x i16> %1 to <16 x i8> ; <<16 x i8>> [#uses=1] %3 = tail call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %2) nounwind readnone ; <<2 x i64>> [#uses=1] ret <2 x i64> %3 - -; X32: _pmovzxbq_1: -; X32: movl L_g16$non_lazy_ptr, %eax -; X32: pmovzxbq (%eax), %xmm0 - -; X64: _pmovzxbq_1: -; X64: movq _g16@GOTPCREL(%rip), %rax -; X64: pmovzxbq (%rax), %xmm0 } declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone - - - define i32 @extractps_1(<4 x float> %v) nounwind { +; X32-LABEL: extractps_1: +; X32: ## BB#0: +; X32-NEXT: extractps $3, %xmm0, %eax +; X32-NEXT: retl +; +; X64-LABEL: extractps_1: +; X64: ## BB#0: +; X64-NEXT: extractps $3, %xmm0, %eax +; X64-NEXT: retq %s = extractelement <4 x float> %v, i32 3 %i = bitcast float %s to i32 ret i32 %i - -; X32: _extractps_1: -; X32: extractps $3, %xmm0, %eax - -; X64: _extractps_1: -; X64: extractps $3, %xmm0, %eax } define i32 @extractps_2(<4 x float> %v) nounwind { +; X32-LABEL: extractps_2: +; X32: ## BB#0: +; X32-NEXT: extractps $3, %xmm0, %eax +; X32-NEXT: retl +; +; X64-LABEL: extractps_2: +; X64: ## BB#0: +; X64-NEXT: extractps $3, %xmm0, %eax +; X64-NEXT: retq %t = bitcast <4 x float> %v to <4 x i32> %s = extractelement <4 x i32> %t, i32 3 ret i32 %s - -; X32: _extractps_2: -; X32: extractps $3, %xmm0, %eax - -; X64: _extractps_2: -; X64: extractps $3, %xmm0, %eax } @@ -117,106 +133,152 @@ define i32 @extractps_2(<4 x float> %v) nounwind { ; is bitcasted to i32, but unsuitable for much of anything else. 
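extractps always lands its result in a general-purpose register, so it only pays off when the value is consumed as an integer. An orientation sketch, not check lines from the commit:
;   extractps $3, %xmm0, %eax    ; integer consumer: lane 3 goes straight to a GPR
;   shufps $231, %xmm0, %xmm0    ; float consumer: lane 3 kept in xmm0[0] instead
ext_1 and ext_2 below show the two float-returning cases being matched against in-register shuffles rather than extractps.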
define float @ext_1(<4 x float> %v) nounwind { +; X32-LABEL: ext_1: +; X32: ## BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X32-NEXT: addss LCPI7_0, %xmm0 +; X32-NEXT: movss %xmm0, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: popl %eax +; X32-NEXT: retl +; +; X64-LABEL: ext_1: +; X64: ## BB#0: +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X64-NEXT: addss {{.*}}(%rip), %xmm0 +; X64-NEXT: retq %s = extractelement <4 x float> %v, i32 3 %t = fadd float %s, 1.0 ret float %t - -; X32: _ext_1: -; X32: pshufd $3, %xmm0, %xmm0 -; X32: addss LCPI7_0, %xmm0 - -; X64: _ext_1: -; X64: pshufd $3, %xmm0, %xmm0 -; X64: addss LCPI7_0(%rip), %xmm0 } define float @ext_2(<4 x float> %v) nounwind { +; X32-LABEL: ext_2: +; X32: ## BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X32-NEXT: movss %xmm0, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: popl %eax +; X32-NEXT: retl +; +; X64-LABEL: ext_2: +; X64: ## BB#0: +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X64-NEXT: retq %s = extractelement <4 x float> %v, i32 3 ret float %s - -; X32: _ext_2: -; X32: pshufd $3, %xmm0, %xmm0 - -; X64: _ext_2: -; X64: pshufd $3, %xmm0, %xmm0 } define i32 @ext_3(<4 x i32> %v) nounwind { +; X32-LABEL: ext_3: +; X32: ## BB#0: +; X32-NEXT: pextrd $3, %xmm0, %eax +; X32-NEXT: retl +; +; X64-LABEL: ext_3: +; X64: ## BB#0: +; X64-NEXT: pextrd $3, %xmm0, %eax +; X64-NEXT: retq %i = extractelement <4 x i32> %v, i32 3 ret i32 %i - -; X32: _ext_3: -; X32: pextrd $3, %xmm0, %eax - -; X64: _ext_3: -; X64: pextrd $3, %xmm0, %eax } define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind { - %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone - ret <4 x float> %tmp1 -; X32: _insertps_1: -; X32: insertps $1, %xmm1, %xmm0 - -; X64: _insertps_1: -; X64: insertps $1, %xmm1, %xmm0 +; X32-LABEL: insertps_1: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3] +; X32-NEXT: retl +; +; X64-LABEL: insertps_1: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1,2,3] +; X64-NEXT: retq + %tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 1) nounwind readnone + ret <4 x float> %tmp1 } declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind { - %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0 - ret <4 x float> %tmp1 -; X32: _insertps_2: -; X32: insertps $0, 4(%esp), %xmm0 - -; X64: _insertps_2: -; X64: insertps $0, %xmm1, %xmm0 +; X32-LABEL: insertps_2: +; X32: ## BB#0: +; X32-NEXT: insertps $0, {{[0-9]+}}(%esp), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_2: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: retq + %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0 + ret <4 x float> %tmp1 } - define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind { - %tmp2 = extractelement <4 x float> %t2, i32 0 - %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0 - ret <4 x float> %tmp1 -; X32: _insertps_3: -; X32: insertps $0, %xmm1, %xmm0 - -; X64: _insertps_3: -; X64: insertps $0, %xmm1, %xmm0 +; X32-LABEL: insertps_3: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X32-NEXT: retl +; +; X64-LABEL: insertps_3: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; X64-NEXT: retq + %tmp2 = extractelement <4 x 
float> %t2, i32 0 + %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0 + ret <4 x float> %tmp1 } define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind { - %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone - ret i32 %tmp1 -; X32: _ptestz_1: -; X32: ptest %xmm1, %xmm0 -; X32: sete %al - -; X64: _ptestz_1: -; X64: ptest %xmm1, %xmm0 -; X64: sete %al +; X32-LABEL: ptestz_1: +; X32: ## BB#0: +; X32-NEXT: ptest %xmm1, %xmm0 +; X32-NEXT: sete %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: ptestz_1: +; X64: ## BB#0: +; X64-NEXT: ptest %xmm1, %xmm0 +; X64-NEXT: sete %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq + %tmp1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone + ret i32 %tmp1 } define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind { - %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone - ret i32 %tmp1 -; X32: _ptestz_2: -; X32: ptest %xmm1, %xmm0 -; X32: sbbl %eax - -; X64: _ptestz_2: -; X64: ptest %xmm1, %xmm0 -; X64: sbbl %eax +; X32-LABEL: ptestz_2: +; X32: ## BB#0: +; X32-NEXT: ptest %xmm1, %xmm0 +; X32-NEXT: sbbl %eax, %eax +; X32-NEXT: andl $1, %eax +; X32-NEXT: retl +; +; X64-LABEL: ptestz_2: +; X64: ## BB#0: +; X64-NEXT: ptest %xmm1, %xmm0 +; X64-NEXT: sbbl %eax, %eax +; X64-NEXT: andl $1, %eax +; X64-NEXT: retq + %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone + ret i32 %tmp1 } define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind { - %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone - ret i32 %tmp1 -; X32: _ptestz_3: -; X32: ptest %xmm1, %xmm0 -; X32: seta %al - -; X64: _ptestz_3: -; X64: ptest %xmm1, %xmm0 -; X64: seta %al +; X32-LABEL: ptestz_3: +; X32: ## BB#0: +; X32-NEXT: ptest %xmm1, %xmm0 +; X32-NEXT: seta %al +; X32-NEXT: movzbl %al, %eax +; X32-NEXT: retl +; +; X64-LABEL: ptestz_3: +; X64: ## BB#0: +; X64-NEXT: ptest %xmm1, %xmm0 +; X64-NEXT: seta %al +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: retq + %tmp1 = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone + ret i32 %tmp1 } @@ -227,6 +289,25 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone ; This used to compile to insertps $0 + insertps $16. insertps $0 is always ; pointless. 
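A key to the insertps immediates checked throughout this file: imm8 bits [7:6] select the source lane (CountS), bits [5:4] the destination lane (CountD), and bits [3:0] zero destination lanes (ZMask). A few worked decodes (an orientation sketch, not check lines from the commit):
;   insertps $16, %xmm1, %xmm0   ; $16 = 0b00010000: xmm0[1] = xmm1[0]
;   insertps $48, %xmm1, %xmm0   ; $48 = 0b00110000: xmm0[3] = xmm1[0]
;   insertps $8,  %xmm0, %xmm0   ; $8  = 0b00001000: ZMask 1000 zeroes xmm0[3]
Negative immediates in later checks are the same byte printed signed, e.g. $-64 = 0xC0, meaning CountS=3, CountD=0.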
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { +; X32-LABEL: buildvector: +; X32: ## BB#0: ## %entry +; X32-NEXT: movaps %xmm0, %xmm2 +; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] +; X32-NEXT: addss %xmm1, %xmm0 +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X32-NEXT: addss %xmm2, %xmm1 +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X32-NEXT: retl +; +; X64-LABEL: buildvector: +; X64: ## BB#0: ## %entry +; X64-NEXT: movaps %xmm0, %xmm2 +; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] +; X64-NEXT: addss %xmm1, %xmm0 +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X64-NEXT: addss %xmm2, %xmm1 +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X64-NEXT: retq entry: %tmp7 = extractelement <2 x float> %A, i32 0 %tmp5 = extractelement <2 x float> %A, i32 1 @@ -237,97 +318,124 @@ entry: %tmp11 = insertelement <2 x float> undef, float %add.r, i32 0 %tmp9 = insertelement <2 x float> %tmp11, float %add.i, i32 1 ret <2 x float> %tmp9 -; X32-LABEL: buildvector: -; X32-NOT: insertps $0 -; X32: insertps $16 -; X32-NOT: insertps $0 -; X32: ret -; X64-LABEL: buildvector: -; X64-NOT: insertps $0 -; X64: insertps $16 -; X64-NOT: insertps $0 -; X64: ret } define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) { +; X32-LABEL: insertps_from_shufflevector_1: +; X32: ## BB#0: ## %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: insertps $48, (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_shufflevector_1: +; X64: ## BB#0: ## %entry +; X64-NEXT: insertps $48, (%rdi), %xmm0 +; X64-NEXT: retq entry: %0 = load <4 x float>* %pb, align 16 %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4> ret <4 x float> %vecinit6 -; CHECK-LABEL: insertps_from_shufflevector_1: -; CHECK-NOT: movss -; CHECK-NOT: shufps -; CHECK: insertps $48, -; CHECK: ret } define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) { +; X32-LABEL: insertps_from_shufflevector_2: +; X32: ## BB#0: ## %entry +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3] +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_shufflevector_2: +; X64: ## BB#0: ## %entry +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3] +; X64-NEXT: retq entry: %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3> ret <4 x float> %vecinit6 -; CHECK-LABEL: insertps_from_shufflevector_2: -; CHECK-NOT: shufps -; CHECK: insertps $96, -; CHECK: ret } ; For loading an i32 from memory into an xmm register we use pinsrd ; instead of insertps define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) { +; X32-LABEL: pinsrd_from_shufflevector_i32: +; X32: ## BB#0: ## %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: insertps $48, (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: pinsrd_from_shufflevector_i32: +; X64: ## BB#0: ## %entry +; X64-NEXT: insertps $48, (%rdi), %xmm0 +; X64-NEXT: retq entry: %0 = load <4 x i32>* %pb, align 16 %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4> ret <4 x i32> %vecinit6 -; CHECK-LABEL: pinsrd_from_shufflevector_i32: -; CHECK-NOT: movss -; CHECK-NOT: shufps -; CHECK: pinsrd $3, -; CHECK: ret } define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) { +; X32-LABEL: insertps_from_shufflevector_i32_2: +; X32: ## BB#0: ## %entry +; X32-NEXT: insertps {{.*#+}} xmm0 = 
xmm0[0],xmm1[3],xmm0[2,3] +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_shufflevector_i32_2: +; X64: ## BB#0: ## %entry +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[3],xmm0[2,3] +; X64-NEXT: retq entry: %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3> ret <4 x i32> %vecinit6 -; CHECK-LABEL: insertps_from_shufflevector_i32_2: -; CHECK-NOT: shufps -; CHECK-NOT: movaps -; CHECK: insertps $208, -; CHECK: ret } define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) { -; CHECK-LABEL: insertps_from_load_ins_elt_undef: -; CHECK-NOT: movss -; CHECK-NOT: shufps -; CHECK: insertps $16, -; CHECK: ret +; X32-LABEL: insertps_from_load_ins_elt_undef: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: insertps $16, (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_load_ins_elt_undef: +; X64: ## BB#0: +; X64-NEXT: insertps $16, (%rdi), %xmm0 +; X64-NEXT: retq %1 = load float* %b, align 4 %2 = insertelement <4 x float> undef, float %1, i32 0 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3> ret <4 x float> %result } -define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) { -; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32: ; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr -;; aCHECK-NOT: movd -; CHECK-NOT: shufps -; CHECK: insertps $32, -; CHECK: ret +define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) { +; X32-LABEL: insertps_from_load_ins_elt_undef_i32: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movd (%eax), %xmm1 +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_load_ins_elt_undef_i32: +; X64: ## BB#0: +; X64-NEXT: movd (%rdi), %xmm1 +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; X64-NEXT: retq %1 = load i32* %b, align 4 %2 = insertelement <4 x i32> undef, i32 %1, i32 0 %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3> ret <4 x i32> %result } -;;;;;; Shuffles optimizable with a single insertps instruction +;;;;;; Shuffles optimizable with a single insertps or blend instruction define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_XYZ0: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $8 -; CHECK: ret +; X32-LABEL: shuf_XYZ0: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm1, %xmm1 +; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; X32-NEXT: retl +; +; X64-LABEL: shuf_XYZ0: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecext1 = extractelement <4 x float> %x, i32 1 @@ -339,11 +447,15 @@ define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) { } define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_XY00: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $12 -; CHECK: ret +; X32-LABEL: shuf_XY00: +; X32: ## BB#0: +; X32-NEXT: movq %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: shuf_XY00: +; X64: ## BB#0: +; X64-NEXT: movq %xmm0, %xmm0 +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecext1 = extractelement <4 x float> %x, i32 1 @@ -354,11 +466,15 @@ define <4 x float> 
@shuf_XY00(<4 x float> %x, <4 x float> %a) { } define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_XYY0: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $104 -; CHECK: ret +; X32-LABEL: shuf_XYY0: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero +; X32-NEXT: retl +; +; X64-LABEL: shuf_XYY0: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecext1 = extractelement <4 x float> %x, i32 1 @@ -369,9 +485,15 @@ define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) { } define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_XYW0: -; CHECK: insertps $232 -; CHECK: ret +; X32-LABEL: shuf_XYW0: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero +; X32-NEXT: retl +; +; X64-LABEL: shuf_XYW0: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecext1 = extractelement <4 x float> %x, i32 1 @@ -383,11 +505,15 @@ define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) { } define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_W00W: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $198 -; CHECK: ret +; X32-LABEL: shuf_W00W: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3] +; X32-NEXT: retl +; +; X64-LABEL: shuf_W00W: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3] +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 3 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1 @@ -397,11 +523,19 @@ define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) { } define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_X00A: -; CHECK-NOT: movaps -; CHECK-NOT: shufps -; CHECK: insertps $48 -; CHECK: ret +; X32-LABEL: shuf_X00A: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm2, %xmm2 +; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: retl +; +; X64-LABEL: shuf_X00A: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 @@ -411,11 +545,21 @@ define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) { } define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_X00X: -; CHECK-NOT: movaps -; CHECK-NOT: shufps -; CHECK: insertps $48 -; CHECK: ret +; X32-LABEL: shuf_X00X: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm1, %xmm1 +; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: shuf_X00X: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 @@ -425,12 +569,23 @@ define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) { } define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) { -; CHECK-LABEL: shuf_X0YC: -; CHECK: shufps -; CHECK-NOT: movhlps -; CHECK-NOT: shufps -; CHECK: insertps $176 -; CHECK: ret +; X32-LABEL: shuf_X0YC: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm2, %xmm2 +; X32-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3] +; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero +; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2] +; X32-NEXT: movaps %xmm2, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: shuf_X0YC: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: blendps {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3] +; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero +; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2] +; X64-NEXT: movaps %xmm2, %xmm0 +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1 @@ -440,11 +595,17 @@ define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) { } define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_XYZ0: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $8 -; CHECK: ret +; X32-LABEL: i32_shuf_XYZ0: +; X32: ## BB#0: +; X32-NEXT: pxor %xmm1, %xmm1 +; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_XYZ0: +; X64: ## BB#0: +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecext1 = extractelement <4 x i32> %x, i32 1 @@ -456,11 +617,15 @@ define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_XY00: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $12 -; CHECK: ret +; X32-LABEL: i32_shuf_XY00: +; X32: ## BB#0: +; X32-NEXT: movq %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_XY00: +; X64: ## BB#0: +; X64-NEXT: movq %xmm0, %xmm0 +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecext1 = extractelement <4 x i32> %x, i32 1 @@ -471,11 +636,15 @@ define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_XYY0: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $104 -; CHECK: ret +; X32-LABEL: i32_shuf_XYY0: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_XYY0: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecext1 = extractelement <4 x i32> %x, i32 1 @@ -486,11 +655,15 @@ define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_XYW0: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $232 -; CHECK: ret +; X32-LABEL: i32_shuf_XYW0: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = 
xmm0[0,1,3],zero +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_XYW0: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecext1 = extractelement <4 x i32> %x, i32 1 @@ -502,11 +675,15 @@ define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_W00W: -; CHECK-NOT: pextrd -; CHECK-NOT: punpckldq -; CHECK: insertps $198 -; CHECK: ret +; X32-LABEL: i32_shuf_W00W: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3] +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_W00W: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3] +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 3 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1 @@ -516,11 +693,19 @@ define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_X00A: -; CHECK-NOT: movaps -; CHECK-NOT: shufps -; CHECK: insertps $48 -; CHECK: ret +; X32-LABEL: i32_shuf_X00A: +; X32: ## BB#0: +; X32-NEXT: pxor %xmm2, %xmm2 +; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_X00A: +; X64: ## BB#0: +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 @@ -530,11 +715,21 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_X00X: -; CHECK-NOT: movaps -; CHECK-NOT: shufps -; CHECK: insertps $48 -; CHECK: ret +; X32-LABEL: i32_shuf_X00X: +; X32: ## BB#0: +; X32-NEXT: pxor %xmm1, %xmm1 +; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_X00X: +; X64: ## BB#0: +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 @@ -544,12 +739,21 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { -; CHECK-LABEL: i32_shuf_X0YC: -; CHECK: shufps -; CHECK-NOT: movhlps -; CHECK-NOT: shufps -; CHECK: insertps $176 -; CHECK: ret +; X32-LABEL: i32_shuf_X0YC: +; X32: ## BB#0: +; X32-NEXT: pmovzxdq %xmm0, %xmm2 +; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero +; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2] +; X32-NEXT: movaps %xmm2, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: i32_shuf_X0YC: +; X64: ## BB#0: +; X64-NEXT: pmovzxdq %xmm0, %xmm2 +; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[1],zero +; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[2] +; X64-NEXT: movaps %xmm2, %xmm0 +; 
X64-NEXT: retq %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 @@ -560,11 +764,19 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { ;; Test for a bug in the first implementation of LowerBuildVectorv4x32 define < 4 x float> @test_insertps_no_undef(<4 x float> %x) { -; CHECK-LABEL: test_insertps_no_undef: -; CHECK: movaps %xmm0, %xmm1 -; CHECK-NEXT: insertps $8, %xmm1, %xmm1 -; CHECK-NEXT: maxps %xmm1, %xmm0 -; CHECK-NEXT: ret +; X32-LABEL: test_insertps_no_undef: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm1, %xmm1 +; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; X32-NEXT: maxps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: test_insertps_no_undef: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; X64-NEXT: maxps %xmm1, %xmm0 +; X64-NEXT: retq %vecext = extractelement <4 x float> %x, i32 0 %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 %vecext1 = extractelement <4 x float> %x, i32 1 @@ -578,48 +790,75 @@ define < 4 x float> @test_insertps_no_undef(<4 x float> %x) { } define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) { -; CHECK-LABEL: blendvb_fallback -; CHECK: blendvb -; CHECK: ret +; X32-LABEL: blendvb_fallback: +; X32: ## BB#0: +; X32-NEXT: psllw $15, %xmm0 +; X32-NEXT: psraw $15, %xmm0 +; X32-NEXT: pblendvb %xmm1, %xmm2 +; X32-NEXT: movdqa %xmm2, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: blendvb_fallback: +; X64: ## BB#0: +; X64-NEXT: psllw $15, %xmm0 +; X64-NEXT: psraw $15, %xmm0 +; X64-NEXT: pblendvb %xmm1, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: retq %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y ret <8 x i16> %ret } -define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) { -; CHECK-LABEL: insertps_from_vector_load: ; On X32, account for the argument's move to registers -; X32: movl 4(%esp), %eax -; CHECK-NOT: mov -; CHECK: insertps $48 -; CHECK-NEXT: ret +define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) { +; X32-LABEL: insertps_from_vector_load: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: insertps $48, (%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_vector_load: +; X64: ## BB#0: +; X64-NEXT: insertps $48, (%rdi), %xmm0 +; X64-NEXT: retq %1 = load <4 x float>* %pb, align 16 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48) ret <4 x float> %2 } ;; Use a non-zero CountS for insertps -define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) { -; CHECK-LABEL: insertps_from_vector_load_offset: -; On X32, account for the argument's move to registers -; X32: movl 4(%esp), %eax -; CHECK-NOT: mov ;; Try to match a bit more of the instr, since we need the load's offset. 
-; CHECK: insertps $96, 4(%{{...}}), % -; CHECK-NEXT: ret +define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) { +; X32-LABEL: insertps_from_vector_load_offset: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: insertps $96, 4(%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_vector_load_offset: +; X64: ## BB#0: +; X64-NEXT: insertps $96, 4(%rdi), %xmm0 +; X64-NEXT: retq %1 = load <4 x float>* %pb, align 16 %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96) ret <4 x float> %2 } -define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) { -; CHECK-LABEL: insertps_from_vector_load_offset_2: -; On X32, account for the argument's move to registers -; X32: movl 4(%esp), %eax -; X32: movl 8(%esp), %ecx -; CHECK-NOT: mov ;; Try to match a bit more of the instr, since we need the load's offset. -; CHECK: insertps $192, 12(%{{...}},%{{...}}), % -; CHECK-NEXT: ret +define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) { +; X32-LABEL: insertps_from_vector_load_offset_2: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: shll $4, %ecx +; X32-NEXT: insertps $-64, 12(%eax,%ecx), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_vector_load_offset_2: +; X64: ## BB#0: +; X64-NEXT: shlq $4, %rsi +; X64-NEXT: insertps $-64, 12(%rdi,%rsi), %xmm0 +; X64-NEXT: retq %1 = getelementptr inbounds <4 x float>* %pb, i64 %index %2 = load <4 x float>* %1, align 16 %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192) @@ -627,13 +866,21 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa } define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) { -; CHECK-LABEL: insertps_from_broadcast_loadf32: -; On X32, account for the arguments' move to registers -; X32: movl 8(%esp), %eax -; X32: movl 4(%esp), %ecx -; CHECK-NOT: mov -; CHECK: insertps $48 -; CHECK-NEXT: ret +; X32-LABEL: insertps_from_broadcast_loadf32: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movss (%ecx,%eax,4), %xmm1 +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_broadcast_loadf32: +; X64: ## BB#0: +; X64-NEXT: movss (%rdi,%rsi,4), %xmm1 +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X64-NEXT: retq %1 = getelementptr inbounds float* %fb, i64 %index %2 = load float* %1, align 4 %3 = insertelement <4 x float> undef, float %2, i32 0 @@ -645,12 +892,20 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap } define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) { -; CHECK-LABEL: insertps_from_broadcast_loadv4f32: -; On X32, account for the arguments' move to registers -; X32: movl 4(%esp), %{{...}} -; CHECK-NOT: mov -; CHECK: insertps $48 -; CHECK-NEXT: ret +; X32-LABEL: insertps_from_broadcast_loadv4f32: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movups (%eax), %xmm1 +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; 
X32-NEXT: retl +; +; X64-LABEL: insertps_from_broadcast_loadv4f32: +; X64: ## BB#0: +; X64-NEXT: movups (%rdi), %xmm1 +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X64-NEXT: retq %1 = load <4 x float>* %b, align 4 %2 = extractelement <4 x float> %1, i32 0 %3 = insertelement <4 x float> undef, float %2, i32 0 @@ -663,20 +918,33 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float ;; FIXME: We're emitting an extraneous pshufd/vbroadcast. define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) { -; CHECK-LABEL: insertps_from_broadcast_multiple_use: -; On X32, account for the arguments' move to registers -; X32: movl 8(%esp), %eax -; X32: movl 4(%esp), %ecx -; CHECK: movss -; CHECK-NOT: mov -; CHECK: insertps $48 -; CHECK: insertps $48 -; CHECK: insertps $48 -; CHECK: insertps $48 -; CHECK: addps -; CHECK: addps -; CHECK: addps -; CHECK-NEXT: ret +; X32-LABEL: insertps_from_broadcast_multiple_use: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movss (%ecx,%eax,4), %xmm4 +; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] +; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] +; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] +; X32-NEXT: addps %xmm1, %xmm0 +; X32-NEXT: addps %xmm2, %xmm3 +; X32-NEXT: addps %xmm3, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_from_broadcast_multiple_use: +; X64: ## BB#0: +; X64-NEXT: movss (%rdi,%rsi,4), %xmm4 +; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] +; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] +; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] +; X64-NEXT: addps %xmm1, %xmm0 +; X64-NEXT: addps %xmm2, %xmm3 +; X64-NEXT: addps %xmm3, %xmm0 +; X64-NEXT: retq %1 = getelementptr inbounds float* %fb, i64 %index %2 = load float* %1, align 4 %3 = insertelement <4 x float> undef, float %2, i32 0 @@ -694,10 +962,20 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl } define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) { -; CHECK-LABEL: insertps_with_undefs: -; CHECK-NOT: shufps -; CHECK: insertps $32, %xmm0 -; CHECK: ret +; X32-LABEL: insertps_with_undefs: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movss (%eax), %xmm1 +; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3] +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_with_undefs: +; X64: ## BB#0: +; X64-NEXT: movss (%rdi), %xmm1 +; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm0[0],xmm1[3] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq %1 = load float* %b, align 4 %2 = insertelement <4 x float> undef, float %1, i32 0 %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 4, i32 undef, i32 0, i32 7> @@ -707,9 +985,16 @@ define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) { ; Test for a bug in X86ISelLowering.cpp:getINSERTPS where we were using ; the destination index to change the load, instead of the source index. 
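Concretely for pr20087 below: $-78 is 0xB2, i.e. CountS=2, CountD=3, ZMask=0010, so the folded load must be displaced by CountS*4 = 8 bytes, which is exactly what the checks require; the old bug used the destination index and would have read from offset 12 instead.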
define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) { -; CHECK-LABEL: pr20087: -; CHECK: insertps $48 -; CHECK: ret +; X32-LABEL: pr20087: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: insertps $-78, 8(%eax), %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: pr20087: +; X64: ## BB#0: +; X64-NEXT: insertps $-78, 8(%rdi), %xmm0 +; X64-NEXT: retq %load = load <4 x float> *%ptr %ret = shufflevector <4 x float> %load, <4 x float> %a, <4 x i32> <i32 4, i32 undef, i32 6, i32 2> ret <4 x float> %ret @@ -717,18 +1002,201 @@ define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) { ; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1> define void @insertps_pr20411(i32* noalias nocapture %RET) #1 { -; CHECK-LABEL: insertps_pr20411: -; CHECK: movaps {{[^,]*}}, %[[REG1:xmm.]] -; CHECK: pshufd {{.*}} ## [[REG2:xmm.]] = mem[3,0,0,0] -; CHECK: insertps {{.*}} ## xmm1 = [[REG2]][0],[[REG1]][3]{{.*}} - +; X32-LABEL: insertps_pr20411: +; X32: ## BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; X32-NEXT: insertps $-36, LCPI49_1+12, %xmm0 +; X32-NEXT: movups %xmm0, (%eax) +; X32-NEXT: retl +; +; X64-LABEL: insertps_pr20411: +; X64: ## BB#0: +; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; X64-NEXT: insertps $-36, LCPI49_1+{{.*}}(%rip), %xmm0 +; X64-NEXT: movups %xmm0, (%rdi) +; X64-NEXT: retq %gather_load = shufflevector <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> %shuffle109 = shufflevector <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; 4 5 6 7 - %shuffle116 = shufflevector <8 x i32> %gather_load, <8 x i32> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> ; 3 x x x %shuffle117 = shufflevector <4 x i32> %shuffle109, <4 x i32> %shuffle116, <4 x i32> <i32 4, i32 3, i32 undef, i32 undef> ; 3 7 x x - %ptrcast = bitcast i32* %RET to <4 x i32>* store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4 ret void } + +define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) { +; X32-LABEL: insertps_4: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero +; X32-NEXT: retl +; +; X64-LABEL: insertps_4: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1 + %vecext2 = extractelement <4 x float> %B, i32 2 + %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) { +; X32-LABEL: insertps_5: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero +; X32-NEXT: retl +; +; X64-LABEL: insertps_5: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %B, i32 1 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float 
0.000000e+00, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) { +; X32-LABEL: insertps_6: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero +; X32-NEXT: retl +; +; X64-LABEL: insertps_6: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 1 + %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1 + %vecext1 = extractelement <4 x float> %B, i32 2 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3 + ret <4 x float> %vecinit3 +} + +define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) { +; X32-LABEL: insertps_7: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero +; X32-NEXT: retl +; +; X64-LABEL: insertps_7: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1 + %vecext2 = extractelement <4 x float> %B, i32 1 + %vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) { +; X32-LABEL: insertps_8: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; X32-NEXT: retl +; +; X64-LABEL: insertps_8: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 0 + %vecinit = insertelement <4 x float> undef, float %vecext, i32 0 + %vecext1 = extractelement <4 x float> %B, i32 0 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2 + %vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3 + ret <4 x float> %vecinit4 +} + +define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) { +; X32-LABEL: insertps_9: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero +; X32-NEXT: movaps %xmm1, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: insertps_9: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 0 + %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1 + %vecext1 = extractelement <4 x float> %B, i32 2 + %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2 + %vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3 + ret <4 x float> %vecinit3 +} + +define <4 x float> @insertps_10(<4 x float> %A) +{ +; X32-LABEL: insertps_10: +; X32: ## BB#0: +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero +; X32-NEXT: retl +; +; X64-LABEL: insertps_10: +; X64: ## BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero +; X64-NEXT: retq + %vecext = extractelement <4 x float> %A, i32 0 + %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 
0.000000e+00>, float %vecext, i32 0 + %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2 + ret <4 x float> %vecbuild2 +} + +define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) { +; X32-LABEL: build_vector_to_shuffle_1: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm1, %xmm1 +; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; X32-NEXT: retl +; +; X64-LABEL: build_vector_to_shuffle_1: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 1 + %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2 + %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x float> %vecinit3 +} + +define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) { +; X32-LABEL: build_vector_to_shuffle_2: +; X32: ## BB#0: +; X32-NEXT: xorps %xmm1, %xmm1 +; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; X32-NEXT: retl +; +; X64-LABEL: build_vector_to_shuffle_2: +; X64: ## BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; X64-NEXT: retq +entry: + %vecext = extractelement <4 x float> %A, i32 1 + %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1 + %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2 + ret <4 x float> %vecinit1 +} diff --git a/test/CodeGen/X86/sse_partial_update.ll b/test/CodeGen/X86/sse_partial_update.ll index 2c16a554aebb..a88ab014641b 100644 --- a/test/CodeGen/X86/sse_partial_update.ll +++ b/test/CodeGen/X86/sse_partial_update.ll @@ -5,11 +5,18 @@ ; There is a mismatch between the intrinsic and the actual instruction. ; The actual instruction has a partial update of dest, while the intrinsic ; passes through the upper FP values. Here, we make sure the source and -; destination of rsqrtss are the same. -define void @t1(<4 x float> %a) nounwind uwtable ssp { +; destination of each scalar unary op are the same. 
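To make the hazard concrete, a minimal sketch with hypothetical register assignments:
;   rsqrtss %xmm1, %xmm0   ; writes xmm0[31:0] only; xmm0[127:32] keep their old
;                          ; contents instead of the intrinsic source's upper lanes
;   rsqrtss %xmm0, %xmm0   ; src == dst, so the preserved upper lanes are exactly
;                          ; the pass-through values the IR semantics require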
+ +define void @rsqrtss(<4 x float> %a) nounwind uwtable ssp { entry: -; CHECK-LABEL: t1: +; CHECK-LABEL: rsqrtss: ; CHECK: rsqrtss %xmm0, %xmm0 +; CHECK-NEXT: cvtss2sd %xmm0 +; CHECK-NEXT: shufps +; CHECK-NEXT: cvtss2sd %xmm0 +; CHECK-NEXT: movap +; CHECK-NEXT: jmp + %0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind %a.addr.0.extract = extractelement <4 x float> %0, i32 0 %conv = fpext float %a.addr.0.extract to double @@ -21,10 +28,16 @@ entry: declare void @callee(double, double) declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone -define void @t2(<4 x float> %a) nounwind uwtable ssp { +define void @rcpss(<4 x float> %a) nounwind uwtable ssp { entry: -; CHECK-LABEL: t2: +; CHECK-LABEL: rcpss: ; CHECK: rcpss %xmm0, %xmm0 +; CHECK-NEXT: cvtss2sd %xmm0 +; CHECK-NEXT: shufps +; CHECK-NEXT: cvtss2sd %xmm0 +; CHECK-NEXT: movap +; CHECK-NEXT: jmp + %0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind %a.addr.0.extract = extractelement <4 x float> %0, i32 0 %conv = fpext float %a.addr.0.extract to double @@ -34,3 +47,23 @@ entry: ret void } declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone + +define void @sqrtss(<4 x float> %a) nounwind uwtable ssp { +entry: +; CHECK-LABEL: sqrtss: +; CHECK: sqrtss %xmm0, %xmm0 +; CHECK-NEXT: cvtss2sd %xmm0 +; CHECK-NEXT: shufps +; CHECK-NEXT: cvtss2sd %xmm0 +; CHECK-NEXT: movap +; CHECK-NEXT: jmp + + %0 = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a) nounwind + %a.addr.0.extract = extractelement <4 x float> %0, i32 0 + %conv = fpext float %a.addr.0.extract to double + %a.addr.4.extract = extractelement <4 x float> %0, i32 1 + %conv3 = fpext float %a.addr.4.extract to double + tail call void @callee(double %conv, double %conv3) nounwind + ret void +} +declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone diff --git a/test/CodeGen/X86/stack-probe-size.ll b/test/CodeGen/X86/stack-probe-size.ll new file mode 100644 index 000000000000..21482c3abded --- /dev/null +++ b/test/CodeGen/X86/stack-probe-size.ll @@ -0,0 +1,78 @@ +; This test is attempting to detect that the compiler correctly generates stack +; probe calls when the size of the local variables exceeds the specified stack +; probe size. +; +; Testing the default value of 4096 bytes makes sense, because the default +; stack probe size equals the page size (4096 bytes for all x86 targets), and +; this is unlikely to change in the future. 
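In summary, the thresholds the tests below pin down (a sketch; the exact frame size also includes bookkeeping overhead):
;   alloca [4095 x i8], default probe size          -> subl $4095, %esp
;   alloca [4096 x i8], default probe size          -> movl $4096, %eax; calll __chkstk
;   alloca [4096 x i8], "stack-probe-size"="8192"   -> subl $4096, %esp
__chkstk touches the new frame one page at a time so the guard page is always hit in order; a single subl could step past it when the frame spans multiple pages.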
+;
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686-pc-windows-msvc"
+
+define i32 @test1() "stack-probe-size"="0" {
+  %buffer = alloca [4095 x i8]
+
+  ret i32 0
+
+; CHECK-LABEL: _test1:
+; CHECK-NOT: subl $4095, %esp
+; CHECK: movl $4095, %eax
+; CHECK: calll __chkstk
+}
+
+define i32 @test2() {
+  %buffer = alloca [4095 x i8]
+
+  ret i32 0
+
+; CHECK-LABEL: _test2:
+; CHECK-NOT: movl $4095, %eax
+; CHECK: subl $4095, %esp
+; CHECK-NOT: calll __chkstk
+}
+
+define i32 @test3() "stack-probe-size"="8192" {
+  %buffer = alloca [4095 x i8]
+
+  ret i32 0
+
+; CHECK-LABEL: _test3:
+; CHECK-NOT: movl $4095, %eax
+; CHECK: subl $4095, %esp
+; CHECK-NOT: calll __chkstk
+}
+
+define i32 @test4() "stack-probe-size"="0" {
+  %buffer = alloca [4096 x i8]
+
+  ret i32 0
+
+; CHECK-LABEL: _test4:
+; CHECK-NOT: subl $4096, %esp
+; CHECK: movl $4096, %eax
+; CHECK: calll __chkstk
+}
+
+define i32 @test5() {
+  %buffer = alloca [4096 x i8]
+
+  ret i32 0
+
+; CHECK-LABEL: _test5:
+; CHECK-NOT: subl $4096, %esp
+; CHECK: movl $4096, %eax
+; CHECK: calll __chkstk
+}
+
+define i32 @test6() "stack-probe-size"="8192" {
+  %buffer = alloca [4096 x i8]
+
+  ret i32 0
+
+; CHECK-LABEL: _test6:
+; CHECK-NOT: movl $4096, %eax
+; CHECK: subl $4096, %esp
+; CHECK-NOT: calll __chkstk
+}
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
index cf88ade9363d..a84b77eac5f6 100644
--- a/test/CodeGen/X86/stack-protector-dbginfo.ll
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -10,88 +10,88 @@
 ; Function Attrs: nounwind sspreq
 define i32 @_Z18read_response_sizev() #0 {
 entry:
-  tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23), !dbg !39
+  tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23, metadata !{!"0x102"}), !dbg !39
   %0 = load i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0), align 8, !dbg !40
-  tail call void @llvm.dbg.value(metadata !63, i64 0, metadata !64), !dbg !71
+  tail call void @llvm.dbg.value(metadata i32 undef, i64 0, metadata !64, metadata !{!"0x102"}), !dbg !71
   %1 = trunc i64 %0 to i32
   ret i32 %1
 }
 
 ; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata)
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
 attributes #0 = { sspreq }
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!21, !72}
 
-!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 true, metadata !"", i32 0, metadata !2, metadata !5, metadata !8, metadata !20, metadata !5, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus]
-!1 = metadata !{metadata !"<unknown>", metadata !"/Users/matt/ryan_bug"}
-!2 = metadata !{metadata !3}
-!3 = metadata !{i32 786436, metadata !1, metadata !4, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
-!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 19, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{}
-!6 = metadata !{metadata !7}
-!7 = metadata !{i32 786472, metadata !"max_frame_size", i64 0} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
-!8 = metadata !{metadata !9, metadata !24, metadata !41, metadata !65}
-!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata
!"read_response_size", metadata !"read_response_size", metadata !"_Z18read_response_sizev", i32 27, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @_Z18read_response_sizev, null, null, metadata !14, i32 27} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size] -!10 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>] -!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!12 = metadata !{metadata !13} -!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] -!14 = metadata !{metadata !15, metadata !19} -!15 = metadata !{i32 786688, metadata !9, metadata !"b", metadata !10, i32 28, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 28] -!16 = metadata !{i32 786451, metadata !1, null, metadata !"B", i32 16, i64 32, i64 32, i32 0, i32 0, null, metadata !17, i32 0, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ] -!17 = metadata !{metadata !18} -!18 = metadata !{i32 786445, metadata !1, metadata !16, metadata !"end_of_file", i32 17, i64 32, i64 32, i64 0, i32 0, metadata !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int] -!19 = metadata !{i32 786688, metadata !9, metadata !"c", metadata !10, i32 29, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [c] [line 29] -!20 = metadata !{} -!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2} -!22 = metadata !{i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0)} -!23 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, metadata !38} ; [ DW_TAG_arg_variable ] [p2] [line 12] -!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long>", metadata !"min<unsigned long long>", metadata !"_ZN3__13minIyEERKT_S3_RS1_", i32 12, metadata !27, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !33, null, metadata !35, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [min<unsigned long long>] -!25 = metadata !{i32 786489, metadata !26, null, metadata !"__1", i32 1} ; [ DW_TAG_namespace ] [__1] [line 1] -!26 = metadata !{metadata !"main.cpp", metadata !"/Users/matt/ryan_bug"} -!27 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!28 = metadata !{metadata !29, metadata !29, metadata !32} -!29 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ] -!30 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int] -!31 = metadata !{i32 786468, null, null, metadata !"long long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned] -!32 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] 
[from long long unsigned int] -!33 = metadata !{metadata !34} -!34 = metadata !{i32 786479, null, metadata !"_Tp", metadata !31, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ] -!35 = metadata !{metadata !36, metadata !37} -!36 = metadata !{i32 786689, metadata !24, metadata !"p1", metadata !10, i32 16777228, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 12] -!37 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 12] -!38 = metadata !{i32 33, i32 0, metadata !9, null} -!39 = metadata !{i32 12, i32 0, metadata !24, metadata !38} -!40 = metadata !{i32 9, i32 0, metadata !41, metadata !59} -!41 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long, __1::A>", metadata !"min<unsigned long long, __1::A>", metadata !"_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_", i32 7, metadata !42, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !53, null, metadata !55, i32 8} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>] -!42 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !43, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!43 = metadata !{metadata !29, metadata !29, metadata !32, metadata !44} -!44 = metadata !{i32 786451, metadata !1, metadata !25, metadata !"A", i32 0, i64 8, i64 8, i32 0, i32 0, null, metadata !45, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ] -!45 = metadata !{metadata !46} -!46 = metadata !{i32 786478, metadata !1, metadata !44, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !52, i32 1} ; [ DW_TAG_subprogram ] [line 1] [operator()] -!47 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !48, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!48 = metadata !{metadata !13, metadata !49, metadata !50, metadata !50} -!49 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A] -!50 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ] -!51 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int] -!52 = metadata !{i32 786468} -!53 = metadata !{metadata !34, metadata !54} -!54 = metadata !{i32 786479, null, metadata !"_Compare", metadata !44, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ] -!55 = metadata !{metadata !56, metadata !57, metadata !58} -!56 = metadata !{i32 786689, metadata !41, metadata !"p1", metadata !10, i32 16777223, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 7] -!57 = metadata !{i32 786689, metadata !41, metadata !"p2", metadata !10, i32 33554439, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 7] -!58 = metadata !{i32 786689, metadata !41, metadata !"p3", metadata !10, i32 50331656, metadata !44, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p3] [line 8] -!59 = metadata 
!{i32 13, i32 0, metadata !24, metadata !38} -!63 = metadata !{i32 undef} -!64 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, metadata !40} ; [ DW_TAG_arg_variable ] [p1] [line 1] -!65 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !46, metadata !66, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()] -!66 = metadata !{metadata !67, metadata !69, metadata !70} -!67 = metadata !{i32 786689, metadata !65, metadata !"this", null, i32 16777216, metadata !68, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0] -!68 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A] -!69 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 1] -!70 = metadata !{i32 786689, metadata !65, metadata !"", metadata !10, i32 50331650, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 2] -!71 = metadata !{i32 1, i32 0, metadata !65, metadata !40} -!72 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!0 = !{!"0x11\004\00clang version 3.4 \001\00\000\00\001", !1, !2, !5, !8, !20, !5} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus] +!1 = !{!"<unknown>", !"/Users/matt/ryan_bug"} +!2 = !{!3} +!3 = !{!"0x4\00\0020\0032\0032\000\000\000", !1, !4, null, !6, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ] +!4 = !{!"0x13\00C\0019\008\008\000\000\000", !1, null, null, !5, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ] +!5 = !{} +!6 = !{!7} +!7 = !{!"0x28\00max_frame_size\000"} ; [ DW_TAG_enumerator ] [max_frame_size :: 0] +!8 = !{!9, !24, !41, !65} +!9 = !{!"0x2e\00read_response_size\00read_response_size\00_Z18read_response_sizev\0027\000\001\000\006\00256\001\0027", !1, !10, !11, null, i32 ()* @_Z18read_response_sizev, null, null, !14} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size] +!10 = !{!"0x29", !1} ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>] +!11 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !12, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!12 = !{!13} +!13 = !{!"0x24\00int\000\0032\0032\000\000\005", null, null} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!14 = !{!15, !19} +!15 = !{!"0x100\00b\0028\000", !9, !10, !16} ; [ DW_TAG_auto_variable ] [b] [line 28] +!16 = !{!"0x13\00B\0016\0032\0032\000\000\000", !1, null, null, !17, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ] +!17 = !{!18} +!18 = !{!"0xd\00end_of_file\0017\0032\0032\000\000", !1, !16, !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int] +!19 = !{!"0x100\00c\0029\000", !9, !10, !13} ; [ DW_TAG_auto_variable ] [c] [line 29] +!20 = !{} +!21 = !{i32 2, !"Dwarf Version", i32 2} +!22 = !{i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0)} +!23 = !{!"0x101\00p2\0033554444\000", !24, !10, !32, !38} ; [ DW_TAG_arg_variable ] [p2] [line 12] +!24 = !{!"0x2e\00min<unsigned long long>\00min<unsigned long 
long>\00_ZN3__13minIyEERKT_S3_RS1_\0012\000\001\000\006\00256\001\0012", !1, !25, !27, null, null, !33, null, !35} ; [ DW_TAG_subprogram ] [line 12] [def] [min<unsigned long long>] +!25 = !{!"0x39\00__1\001", !26, null} ; [ DW_TAG_namespace ] [__1] [line 1] +!26 = !{!"main.cpp", !"/Users/matt/ryan_bug"} +!27 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !28, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!28 = !{!29, !29, !32} +!29 = !{!"0x10\00\000\000\000\000\000", null, null, !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ] +!30 = !{!"0x26\00\000\000\000\000\000", null, null, !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int] +!31 = !{!"0x24\00long long unsigned int\000\0064\0064\000\000\007", null, null} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned] +!32 = !{!"0x10\00\000\000\000\000\000", null, null, !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int] +!33 = !{!34} +!34 = !{!"0x2f\00_Tp\000\000", null, !31, null} ; [ DW_TAG_template_type_parameter ] +!35 = !{!36, !37} +!36 = !{!"0x101\00p1\0016777228\000", !24, !10, !29} ; [ DW_TAG_arg_variable ] [p1] [line 12] +!37 = !{!"0x101\00p2\0033554444\000", !24, !10, !32} ; [ DW_TAG_arg_variable ] [p2] [line 12] +!38 = !MDLocation(line: 33, scope: !9) +!39 = !MDLocation(line: 12, scope: !24, inlinedAt: !38) +!40 = !MDLocation(line: 9, scope: !41, inlinedAt: !59) +!41 = !{!"0x2e\00min<unsigned long long, __1::A>\00min<unsigned long long, __1::A>\00_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_\007\000\001\000\006\00256\001\008", !1, !25, !42, null, null, !53, null, !55} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>] +!42 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !43, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!43 = !{!29, !29, !32, !44} +!44 = !{!"0x13\00A\000\008\008\000\000\000", !1, !25, null, !45, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ] +!45 = !{!46} +!46 = !{!"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\000\000\006\00256\001\001", !1, !44, !47, null, null, null, i32 0, !52} ; [ DW_TAG_subprogram ] [line 1] [operator()] +!47 = !{!"0x15\00\000\000\000\000\000\000", i32 0, null, null, !48, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!48 = !{!13, !49, !50, !50} +!49 = !{!"0xf\00\000\0064\0064\000\001088", i32 0, null, !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A] +!50 = !{!"0x10\00\000\000\000\000\000", null, null, !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ] +!51 = !{!"0x26\00\000\000\000\000\000", null, null, !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int] +!52 = !{i32 786468} +!53 = !{!34, !54} +!54 = !{!"0x2f\00_Compare\000\000", null, !44, null} ; [ DW_TAG_template_type_parameter ] +!55 = !{!56, !57, !58} +!56 = !{!"0x101\00p1\0016777223\000", !41, !10, !29} ; [ DW_TAG_arg_variable ] [p1] [line 7] +!57 = !{!"0x101\00p2\0033554439\000", !41, !10, !32} ; [ DW_TAG_arg_variable ] [p2] [line 7] +!58 = !{!"0x101\00p3\0050331656\000", !41, !10, !44} ; [ DW_TAG_arg_variable ] [p3] [line 8] +!59 = !MDLocation(line: 13, scope: !24, inlinedAt: !38) +!63 = !{i32 undef} +!64 = 
!{!"0x101\00p1\0033554433\000", !65, !10, !50, !40} ; [ DW_TAG_arg_variable ] [p1] [line 1] +!65 = !{!"0x2e\00operator()\00operator()\00_ZN3__11AclERKiS2_\001\000\001\000\006\00256\001\002", !1, !25, !47, null, null, null, !46, !66} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()] +!66 = !{!67, !69, !70} +!67 = !{!"0x101\00this\0016777216\001088", !65, null, !68} ; [ DW_TAG_arg_variable ] [this] [line 0] +!68 = !{!"0xf\00\000\0064\0064\000\000", null, null, !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A] +!69 = !{!"0x101\00p1\0033554433\000", !65, !10, !50} ; [ DW_TAG_arg_variable ] [p1] [line 1] +!70 = !{!"0x101\00\0050331650\000", !65, !10, !50} ; [ DW_TAG_arg_variable ] [line 2] +!71 = !MDLocation(line: 1, scope: !65, inlinedAt: !40) +!72 = !{i32 1, !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/stack-protector-weight.ll b/test/CodeGen/X86/stack-protector-weight.ll new file mode 100644 index 000000000000..c5bf49134e4b --- /dev/null +++ b/test/CodeGen/X86/stack-protector-weight.ll @@ -0,0 +1,36 @@ +; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=true %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=SELDAG +; RUN: llc -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=IR + +; SELDAG: # Machine code for function test_branch_weights: +; SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](1048575) BB#[[FAILURE:[0-9]+]](1) +; SELDAG: BB#[[FAILURE]]: +; SELDAG: CALL64pcrel32 <es:__stack_chk_fail> +; SELDAG: BB#[[SUCCESS]]: + +; IR: # Machine code for function test_branch_weights: +; IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]](1048575) BB#[[FAILURE:[0-9]+]](1) +; IR: BB#[[SUCCESS]]: +; IR: BB#[[FAILURE]]: +; IR: CALL64pcrel32 <ga:@__stack_chk_fail> + +define i32 @test_branch_weights(i32 %n) #0 { +entry: + %a = alloca [128 x i32], align 16 + %0 = bitcast [128 x i32]* %a to i8* + call void @llvm.lifetime.start(i64 512, i8* %0) + %arraydecay = getelementptr inbounds [128 x i32]* %a, i64 0, i64 0 + call void @foo2(i32* %arraydecay) + %idxprom = sext i32 %n to i64 + %arrayidx = getelementptr inbounds [128 x i32]* %a, i64 0, i64 %idxprom + %1 = load i32* %arrayidx, align 4 + call void @llvm.lifetime.end(i64 512, i8* %0) + ret i32 %1 +} + +declare void @llvm.lifetime.start(i64, i8* nocapture) + +declare void @foo2(i32*) + +declare void @llvm.lifetime.end(i64, i8* nocapture) + +attributes #0 = { ssp "stack-protector-buffer-size"="8" } diff --git a/test/CodeGen/X86/stack_guard_remat.ll b/test/CodeGen/X86/stack_guard_remat.ll new file mode 100644 index 000000000000..dd639a7c7b4c --- /dev/null +++ b/test/CodeGen/X86/stack_guard_remat.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s -check-prefix=CHECK + +;CHECK: foo2 +;CHECK: movq ___stack_chk_guard@GOTPCREL(%rip), [[R0:%[a-z0-9]+]] +;CHECK: movq ([[R0]]), {{%[a-z0-9]+}} + +; Function Attrs: nounwind ssp uwtable +define i32 @test_stack_guard_remat() #0 { +entry: + %a1 = alloca [256 x i32], align 16 + %0 = bitcast [256 x i32]* %a1 to i8* + call void @llvm.lifetime.start(i64 1024, i8* %0) + %arraydecay = getelementptr inbounds [256 x i32]* %a1, i64 0, i64 0 + call void @foo3(i32* %arraydecay) + call void asm sideeffect "foo2", "~{r12},~{r13},~{r14},~{r15},~{ebx},~{esi},~{edi},~{dirflag},~{fpsr},~{flags}"() + call void @llvm.lifetime.end(i64 1024, i8* %0) + ret i32 0 +} + +; 
Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) + +declare void @foo3(i32*) + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) + +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/X86/stackmap-fast-isel.ll b/test/CodeGen/X86/stackmap-fast-isel.ll index 0b7e6dbdc7a2..dfb16adaa339 100644 --- a/test/CodeGen/X86/stackmap-fast-isel.ll +++ b/test/CodeGen/X86/stackmap-fast-isel.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim -fast-isel -fast-isel-abort | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -fast-isel -fast-isel-abort | FileCheck %s ; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps ; CHECK-NEXT: __LLVM_StackMaps: diff --git a/test/CodeGen/X86/stackmap-large-constants.ll b/test/CodeGen/X86/stackmap-large-constants.ll new file mode 100644 index 000000000000..73ee4f3d1569 --- /dev/null +++ b/test/CodeGen/X86/stackmap-large-constants.ll @@ -0,0 +1,83 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s + +; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps +; CHECK-NEXT: __LLVM_StackMaps: +; version +; CHECK-NEXT: .byte 1 +; reserved +; CHECK-NEXT: .byte 0 +; reserved +; CHECK-NEXT: .short 0 +; # functions +; CHECK-NEXT: .long 2 +; # constants +; CHECK-NEXT: .long 2 +; # records +; CHECK-NEXT: .long 2 +; function address & stack size +; CHECK-NEXT: .quad _foo +; CHECK-NEXT: .quad 8 +; function address & stack size +; CHECK-NEXT: .quad _bar +; CHECK-NEXT: .quad 8 + +; Constants Array: +; CHECK-NEXT: .quad 9223372036854775807 +; CHECK-NEXT: .quad -9223372036854775808 + +; Patchpoint ID +; CHECK-NEXT: .quad 0 +; Instruction offset +; CHECK-NEXT: .long L{{.*}}-_foo +; reserved +; CHECK-NEXT: .short 0 +; # locations +; CHECK-NEXT: .short 1 +; ConstantIndex +; CHECK-NEXT: .byte 5 +; reserved +; CHECK-NEXT: .byte 8 +; Dwarf RegNum +; CHECK-NEXT: .short 0 +; Offset +; CHECK-NEXT: .long 0 +; padding +; CHECK-NEXT: .short 0 +; NumLiveOuts +; CHECK-NEXT: .short 0 + +; CHECK-NEXT: .align 3 + +declare void @llvm.experimental.stackmap(i64, i32, ...) 
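+
+; Note: these two constants are INT64_MAX and INT64_MIN. Neither fits in
+; the 32-bit payload of an inline SmallConstant location (type 4), so each
+; is spilled to the constants array above and referenced by a ConstantIndex
+; location (type 5), which is why the records check offsets 0 (for @foo)
+; and 1 (for @bar) into that array.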
+
+define void @foo() {
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 9223372036854775807)
+  ret void
+}
+
+; Patchpoint ID
+; CHECK-NEXT: .quad 0
+; Instruction Offset
+; CHECK-NEXT: .long L{{.*}}-_bar
+; reserved
+; CHECK-NEXT: .short 0
+; # locations
+; CHECK-NEXT: .short 1
+; ConstantIndex
+; CHECK-NEXT: .byte 5
+; reserved
+; CHECK-NEXT: .byte 8
+; Dwarf RegNum
+; CHECK-NEXT: .short 0
+; Offset
+; CHECK-NEXT: .long 1
+; padding
+; CHECK-NEXT: .short 0
+; NumLiveOuts
+; CHECK-NEXT: .short 0
+
+
+define void @bar() {
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0, i64 -9223372036854775808)
+  ret void
+}
diff --git a/test/CodeGen/X86/stackmap-liveness.ll b/test/CodeGen/X86/stackmap-liveness.ll
index 897595db2438..31553c0b6842 100644
--- a/test/CodeGen/X86/stackmap-liveness.ll
+++ b/test/CodeGen/X86/stackmap-liveness.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim -enable-patchpoint-liveness=false | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim | FileCheck -check-prefix=PATCH %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -enable-patchpoint-liveness=false | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck -check-prefix=PATCH %s
 ;
 ; Note: Print verbose stackmaps using -debug-only=stackmaps.
diff --git a/test/CodeGen/X86/stackmap-nops.ll b/test/CodeGen/X86/stackmap-nops.ll
index 5a78f24d7b5e..7932c0dfb99d 100644
--- a/test/CodeGen/X86/stackmap-nops.ll
+++ b/test/CodeGen/X86/stackmap-nops.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
 
 define void @nop_test() {
 entry:
@@ -224,6 +224,10 @@ entry:
   tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 28, i32 28)
   tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 29, i32 29)
   tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 30, i32 30)
+; Add an extra stackmap with a zero-length shadow to thwart the shadow
+; optimization. This will force all 15 bytes of the previous shadow to be
+; padded with nops.
+  tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 31, i32 0)
   ret void
 }
diff --git a/test/CodeGen/X86/stackmap-shadow-optimization.ll b/test/CodeGen/X86/stackmap-shadow-optimization.ll
new file mode 100644
index 000000000000..a3725f2c5b72
--- /dev/null
+++ b/test/CodeGen/X86/stackmap-shadow-optimization.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
+
+; Check that the X86 stackmap shadow optimization is only outputting a 3-byte
+; nop here. 8 bytes are requested, but 5 are covered by the code for the call
+; to bar. However, the frame teardown and the return do not count towards the
+; stackmap shadow, as the call return counts as a branch target and so must
+; flush the shadow.
+; Note that in order for a thread not to return into the patched space,
+; the call must be at the end of the shadow, so the required nop must be
+; before the call, not after.
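+;
+; As a rough byte budget (assuming the usual 5-byte callq encoding on
+; x86-64): the stackmap requests an 8-byte shadow, the call to bar covers
+; 5 of those bytes, and a single 3-byte nop before the call makes up the
+; difference, which is exactly what the checks below look for.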
+define void @shadow_optimization_test() { +entry: +; CHECK-LABEL: shadow_optimization_test: +; CHECK: callq _bar +; CHECK: nop +; CHECK: callq _bar +; CHECK-NOT: nop +; CHECK: callq _bar +; CHECK-NOT: nop + call void @bar() + tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 8) + call void @bar() + call void @bar() + ret void +} + +declare void @llvm.experimental.stackmap(i64, i32, ...) +declare void @bar() diff --git a/test/CodeGen/X86/stackmap.ll b/test/CodeGen/X86/stackmap.ll index 85670370d870..5e356f3e03d1 100644 --- a/test/CodeGen/X86/stackmap.ll +++ b/test/CodeGen/X86/stackmap.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s ; ; Note: Print verbose stackmaps using -debug-only=stackmaps. @@ -9,11 +9,11 @@ ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .short 0 ; Num Functions -; CHECK-NEXT: .long 15 +; CHECK-NEXT: .long 16 ; Num LargeConstants ; CHECK-NEXT: .long 3 ; Num Callsites -; CHECK-NEXT: .long 19 +; CHECK-NEXT: .long 20 ; Functions and stack size ; CHECK-NEXT: .quad _constantargs @@ -46,6 +46,8 @@ ; CHECK-NEXT: .quad 8 ; CHECK-NEXT: .quad _clobberScratch ; CHECK-NEXT: .quad 56 +; CHECK-NEXT: .quad _needsStackRealignment +; CHECK-NEXT: .quad -1 ; Large Constants ; CHECK-NEXT: .quad 2147483648 @@ -464,6 +466,23 @@ define void @clobberScratch(i32 %a) { ret void } +; A stack frame which needs to be realigned at runtime (to meet alignment +; criteria for values on the stack) does not have a fixed frame size. +; CHECK-LABEL: .long L{{.*}}-_needsStackRealignment +; CHECK-NEXT: .short 0 +; 0 locations +; CHECK-NEXT: .short 0 +define void @needsStackRealignment() { + %val = alloca i64, i32 3, align 128 + tail call void (...)* @escape_values(i64* %val) +; Note: Adding any non-constant to the stackmap would fail because we +; expected to be able to address off the frame pointer. In a realigned +; frame, we must use the stack pointer instead. This is a separate bug. + tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0) + ret void +} +declare void @escape_values(...) + declare void @llvm.experimental.stackmap(i64, i32, ...) declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) diff --git a/test/CodeGen/X86/stackpointer.ll b/test/CodeGen/X86/stackpointer.ll index 80bcfbf16743..094856b3c57d 100644 --- a/test/CodeGen/X86/stackpointer.ll +++ b/test/CodeGen/X86/stackpointer.ll @@ -25,4 +25,4 @@ declare void @llvm.write_register.i64(metadata, i64) nounwind ; register unsigned long current_stack_pointer asm("rsp"); ; CHECK-NOT: .asciz "rsp" -!0 = metadata !{metadata !"rsp\00"} +!0 = !{!"rsp\00"} diff --git a/test/CodeGen/X86/statepoint-call-lowering.ll b/test/CodeGen/X86/statepoint-call-lowering.ll new file mode 100644 index 000000000000..e7a0dcab9ab7 --- /dev/null +++ b/test/CodeGen/X86/statepoint-call-lowering.ll @@ -0,0 +1,90 @@ +; RUN: llc < %s | FileCheck %s +; This file contains a collection of basic tests to ensure we didn't +; screw up normal call lowering when there are no deopt or gc arguments. 
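+;
+; For reference, each test below takes a plain call such as
+;   %call1 = call zeroext i1 @return_i1()
+; and expresses it in statepoint form as
+;   %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0)
+;   %call1 = call zeroext i1 @llvm.experimental.gc.result.int.i1(i32 %safepoint_token)
+; and then checks that the generated code still looks like an ordinary call.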
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+declare zeroext i1 @return_i1()
+declare zeroext i32 @return_i32()
+declare i32* @return_i32ptr()
+declare float @return_float()
+
+define i1 @test_i1_return() {
+; CHECK-LABEL: test_i1_return
+; This is just checking that an i1 gets lowered normally when there are no
+; extra state arguments to the statepoint
+; CHECK: pushq %rax
+; CHECK: callq return_i1
+; CHECK: popq %rdx
+; CHECK: retq
+entry:
+  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0)
+  %call1 = call zeroext i1 @llvm.experimental.gc.result.int.i1(i32 %safepoint_token)
+  ret i1 %call1
+}
+
+define i32 @test_i32_return() {
+; CHECK-LABEL: test_i32_return
+; CHECK: pushq %rax
+; CHECK: callq return_i32
+; CHECK: popq %rdx
+; CHECK: retq
+entry:
+  %safepoint_token = tail call i32 (i32 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i32f(i32 ()* @return_i32, i32 0, i32 0, i32 0)
+  %call1 = call zeroext i32 @llvm.experimental.gc.result.int.i32(i32 %safepoint_token)
+  ret i32 %call1
+}
+
+define i32* @test_i32ptr_return() {
+; CHECK-LABEL: test_i32ptr_return
+; CHECK: pushq %rax
+; CHECK: callq return_i32ptr
+; CHECK: popq %rdx
+; CHECK: retq
+entry:
+  %safepoint_token = tail call i32 (i32* ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_p0i32f(i32* ()* @return_i32ptr, i32 0, i32 0, i32 0)
+  %call1 = call i32* @llvm.experimental.gc.result.ptr.p0i32(i32 %safepoint_token)
+  ret i32* %call1
+}
+
+define float @test_float_return() {
+; CHECK-LABEL: test_float_return
+; CHECK: pushq %rax
+; CHECK: callq return_float
+; CHECK: popq %rax
+; CHECK: retq
+entry:
+  %safepoint_token = tail call i32 (float ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_f32f(float ()* @return_float, i32 0, i32 0, i32 0)
+  %call1 = call float @llvm.experimental.gc.result.float.f32(i32 %safepoint_token)
+  ret float %call1
+}
+
+define i1 @test_relocate(i32* %a) {
+; CHECK-LABEL: test_relocate
+; Check that an unused relocate has no code-generation impact
+; CHECK: pushq %rax
+; CHECK: callq return_i1
+; CHECK-NEXT: .Ltmp13:
+; CHECK-NEXT: popq %rdx
+; CHECK-NEXT: retq
+entry:
+  %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 0, i32* %a)
+  %call1 = call i32* @llvm.experimental.gc.relocate.p0i32(i32 %safepoint_token, i32 4, i32 4)
+  %call2 = call zeroext i1 @llvm.experimental.gc.result.int.i1(i32 %safepoint_token)
+  ret i1 %call2
+}
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()*, i32, i32, ...)
+declare i1 @llvm.experimental.gc.result.int.i1(i32)
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_i32f(i32 ()*, i32, i32, ...)
+declare i32 @llvm.experimental.gc.result.int.i32(i32)
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_p0i32f(i32* ()*, i32, i32, ...)
+declare i32* @llvm.experimental.gc.result.ptr.p0i32(i32)
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_f32f(float ()*, i32, i32, ...)
+declare float @llvm.experimental.gc.result.float.f32(i32)
+
+declare i32* @llvm.experimental.gc.relocate.p0i32(i32, i32, i32)
diff --git a/test/CodeGen/X86/statepoint-forward.ll b/test/CodeGen/X86/statepoint-forward.ll
new file mode 100644
index 000000000000..12a6ac2c72a9
--- /dev/null
+++ b/test/CodeGen/X86/statepoint-forward.ll
@@ -0,0 +1,107 @@
+; RUN: opt -O3 -S < %s | FileCheck --check-prefix=CHECK-OPT %s
+; RUN: llc < %s | FileCheck --check-prefix=CHECK-LLC %s
+; These tests are targeted at making sure we don't retain information
+; about memory which contains potential gc references across a statepoint.
+; They're carefully written to only outlaw forwarding of references.
+; Depending on the collector, forwarding non-reference fields or
+; constant null references may be perfectly legal. (If unimplemented.)
+; The general structure of these tests is:
+; - learn a fact about memory (via an assume)
+; - cross a statepoint
+; - check the same fact about memory (which we no longer know)
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; If not at a statepoint, we could forward known memory values
+; across this call.
+declare void @func() readonly
+
+;; Forwarding the value of a pointer load is invalid since it may have
+;; changed at the safepoint. Forwarding a non-gc pointer value would
+;; be valid, but is not currently implemented.
+define i1 @test_load_forward(i32 addrspace(1)* addrspace(1)* %p) gc "statepoint-example" {
+entry:
+  %before = load i32 addrspace(1)* addrspace(1)* %p
+  %cmp1 = call i1 @f(i32 addrspace(1)* %before)
+  call void @llvm.assume(i1 %cmp1)
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* addrspace(1)* %p)
+  %pnew = call i32 addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1i32(i32 %safepoint_token, i32 4, i32 4)
+  %after = load i32 addrspace(1)* addrspace(1)* %pnew
+  %cmp2 = call i1 @f(i32 addrspace(1)* %after)
+  ret i1 %cmp2
+
+; CHECK-OPT-LABEL: test_load_forward
+; CHECK-OPT: ret i1 %cmp2
+; CHECK-LLC-LABEL: test_load_forward
+; CHECK-LLC: callq f
+}
+
+;; Same as above, but forwarding from a store
+define i1 @test_store_forward(i32 addrspace(1)* addrspace(1)* %p,
+                              i32 addrspace(1)* %v) gc "statepoint-example" {
+entry:
+  %cmp1 = call i1 @f(i32 addrspace(1)* %v)
+  call void @llvm.assume(i1 %cmp1)
+  store i32 addrspace(1)* %v, i32 addrspace(1)* addrspace(1)* %p
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0, i32 addrspace(1)* addrspace(1)* %p)
+  %pnew = call i32 addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1i32(i32 %safepoint_token, i32 4, i32 4)
+  %after = load i32 addrspace(1)* addrspace(1)* %pnew
+  %cmp2 = call i1 @f(i32 addrspace(1)* %after)
+  ret i1 %cmp2
+
+; CHECK-OPT-LABEL: test_store_forward
+; CHECK-OPT: ret i1 %cmp2
+; CHECK-LLC-LABEL: test_store_forward
+; CHECK-LLC: callq f
+}
+
+; A predicate on the pointer which is not simply null, but whose value
+; would be known unchanged if the pointer value could be forwarded.
+; The implementation of such a function could inspect the integral value
+; of the pointer and is thus not safe to reuse after a statepoint.
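+; For example, with a copying collector the object %v refers to may be
+; relocated at the safepoint; an @f that inspected the raw address bits
+; would then see a different value for the same logical reference, so a
+; result computed before the statepoint cannot be forwarded past it.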
+declare i1 @f(i32 addrspace(1)* %v) readnone
+
+; This is a variant of the test_load_forward test which is intended to
+; highlight the fact that a gc pointer can be stored in part of the heap
+; that is not itself GC managed. The GC may have an external mechanism
+; to know about and update that value at a safepoint. Note that the
+; statepoint does not provide the collector with this root.
+define i1 @test_load_forward_nongc_heap(i32 addrspace(1)** %p) gc "statepoint-example" {
+entry:
+  %before = load i32 addrspace(1)** %p
+  %cmp1 = call i1 @f(i32 addrspace(1)* %before)
+  call void @llvm.assume(i1 %cmp1)
+  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0)
+  %after = load i32 addrspace(1)** %p
+  %cmp2 = call i1 @f(i32 addrspace(1)* %after)
+  ret i1 %cmp2
+
+; CHECK-OPT-LABEL: test_load_forward_nongc_heap
+; CHECK-OPT: ret i1 %cmp2
+; CHECK-LLC-LABEL: test_load_forward_nongc_heap
+; CHECK-LLC: callq f
+}
+
+;; Same as above, but forwarding from a store
+define i1 @test_store_forward_nongc_heap(i32 addrspace(1)** %p,
+                                         i32 addrspace(1)* %v) gc "statepoint-example" {
+entry:
+  %cmp1 = call i1 @f(i32 addrspace(1)* %v)
+  call void @llvm.assume(i1 %cmp1)
+  store i32 addrspace(1)* %v, i32 addrspace(1)** %p
+  call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* @func, i32 0, i32 0, i32 0)
+  %after = load i32 addrspace(1)** %p
+  %cmp2 = call i1 @f(i32 addrspace(1)* %after)
+  ret i1 %cmp2
+
+; CHECK-OPT-LABEL: test_store_forward_nongc_heap
+; CHECK-OPT: ret i1 %cmp2
+; CHECK-LLC-LABEL: test_store_forward_nongc_heap
+; CHECK-LLC: callq f
+}
+
+declare void @llvm.assume(i1)
+declare i32 @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()*, i32, i32, ...)
+declare i32 addrspace(1)* addrspace(1)* @llvm.experimental.gc.relocate.p1p1i32(i32, i32, i32) #3
+
diff --git a/test/CodeGen/X86/statepoint-stack-usage.ll b/test/CodeGen/X86/statepoint-stack-usage.ll
new file mode 100644
index 000000000000..fd24bf841688
--- /dev/null
+++ b/test/CodeGen/X86/statepoint-stack-usage.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; This test is checking to make sure that we reuse the same stack slots
+; for GC values spilled over two different call sites. Since the order
+; of GC arguments differs, naive lowering code would insert loads and
+; stores to rearrange items on the stack. We need to make sure (for
+; performance) that this doesn't happen.
+define i32 @back_to_back_calls(i32* %a, i32* %b, i32* %c) #1 {
+; CHECK-LABEL: back_to_back_calls
+; The exact stores don't matter, but there need to be three stack slots created
+; CHECK: movq %rdx, 16(%rsp)
+; CHECK: movq %rdi, 8(%rsp)
+; CHECK: movq %rsi, (%rsp)
+  %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32* %a, i32* %b, i32* %c)
+  %a1 = tail call coldcc i32* @llvm.experimental.gc.relocate.p0i32(i32 %safepoint_token, i32 9, i32 9)
+  %b1 = tail call coldcc i32* @llvm.experimental.gc.relocate.p0i32(i32 %safepoint_token, i32 9, i32 10)
+  %c1 = tail call coldcc i32* @llvm.experimental.gc.relocate.p0i32(i32 %safepoint_token, i32 9, i32 11)
+; CHECK: callq
+; This is the key check.
There should NOT be any memory moves here +; CHECK-NOT: movq + %safepoint_token2 = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32* %c1, i32* %b1, i32* %a1) + %a2 = tail call coldcc i32* @llvm.experimental.gc.relocate.p0i32(i32 %safepoint_token2, i32 9, i32 11) + %b2 = tail call coldcc i32* @llvm.experimental.gc.relocate.p0i32(i32 %safepoint_token2, i32 9, i32 10) + %c2 = tail call coldcc i32* @llvm.experimental.gc.relocate.p0i32(i32 %safepoint_token2, i32 9, i32 9) +; CHECK: callq + ret i32 1 +} + +; This test simply checks that minor changes in vm state don't prevent slots +; being reused for gc values. +define i32 @reserve_first(i32* %a, i32* %b, i32* %c) #1 { +; CHECK-LABEL: reserve_first +; The exact stores don't matter, but there need to be three stack slots created +; CHECK: movq %rdx, 16(%rsp) +; CHECK: movq %rdi, 8(%rsp) +; CHECK: movq %rsi, (%rsp) + %safepoint_token = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32* %a, i32* %b, i32* %c) + %a1 = tail call coldcc i32* @llvm.experimental.gc.relocate.p0i32(i32 %safepoint_token, i32 9, i32 9) + %b1 = tail call coldcc i32* @llvm.experimental.gc.relocate.p0i32(i32 %safepoint_token, i32 9, i32 10) + %c1 = tail call coldcc i32* @llvm.experimental.gc.relocate.p0i32(i32 %safepoint_token, i32 9, i32 11) +; CHECK: callq +; This is the key check. There should NOT be any memory moves here +; CHECK-NOT: movq + %safepoint_token2 = tail call i32 (void ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()* undef, i32 0, i32 0, i32 5, i32* %a1, i32 0, i32* %c1, i32 0, i32 0, i32* %c1, i32* %b1, i32* %a1) + %a2 = tail call coldcc i32* @llvm.experimental.gc.relocate.p0i32(i32 %safepoint_token2, i32 9, i32 11) + %b2 = tail call coldcc i32* @llvm.experimental.gc.relocate.p0i32(i32 %safepoint_token2, i32 9, i32 10) + %c2 = tail call coldcc i32* @llvm.experimental.gc.relocate.p0i32(i32 %safepoint_token2, i32 9, i32 9) +; CHECK: callq + ret i32 1 +} + +; Function Attrs: nounwind +declare i32* @llvm.experimental.gc.relocate.p0i32(i32, i32, i32) #3 + +declare i32 @llvm.experimental.gc.statepoint.p0f_isVoidf(void ()*, i32, i32, ...) + +attributes #1 = { uwtable } diff --git a/test/CodeGen/X86/statepoint-stackmap-format.ll b/test/CodeGen/X86/statepoint-stackmap-format.ll new file mode 100644 index 000000000000..416674839ea8 --- /dev/null +++ b/test/CodeGen/X86/statepoint-stackmap-format.ll @@ -0,0 +1,109 @@ +; RUN: llc < %s | FileCheck %s +; This test is a sanity check to ensure statepoints are generating StackMap +; sections correctly. This is not intended to be a rigorous test of the +; StackMap format (see the stackmap tests for that). + +target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +declare zeroext i1 @return_i1() + +define i1 @test(i32 addrspace(1)* %ptr) { +; CHECK-LABEL: test +; Do we see one spill for the local value and the store to the +; alloca? 
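+; (A 24-byte frame would be consistent with that: 16 bytes for the
+; two-element alloca plus one 8-byte spill slot for %ptr.)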
+; CHECK: subq $24, %rsp +; CHECK: movq $0, 8(%rsp) +; CHECK: movq %rdi, (%rsp) +; CHECK: callq return_i1 +; CHECK: addq $24, %rsp +; CHECK: retq +entry: + %metadata1 = alloca i32 addrspace(1)*, i32 2, align 8 + store i32 addrspace(1)* null, i32 addrspace(1)** %metadata1 + %safepoint_token = tail call i32 (i1 ()*, i32, i32, ...)* @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()* @return_i1, i32 0, i32 0, i32 2, i32 addrspace(1)* %ptr, i32 addrspace(1)* null, i32 addrspace(1)* %ptr, i32 addrspace(1)* null) + %call1 = call zeroext i1 @llvm.experimental.gc.result.int.i1(i32 %safepoint_token) + %a = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 6, i32 6) + %b = call i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32 %safepoint_token, i32 7, i32 7) +; + ret i1 %call1 +} + +declare i32 @llvm.experimental.gc.statepoint.p0f_i1f(i1 ()*, i32, i32, ...) +declare i1 @llvm.experimental.gc.result.int.i1(i32) +declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(i32, i32, i32) #3 + + +; CHECK-LABEL: .section .llvm_stackmaps +; CHECK-NEXT: __LLVM_StackMaps: +; Header +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .short 0 +; Num Functions +; CHECK-NEXT: .long 1 +; Num LargeConstants +; CHECK-NEXT: .long 0 +; Num Callsites +; CHECK-NEXT: .long 1 + +; Functions and stack size +; CHECK-NEXT: .quad test +; CHECK-NEXT: .quad 24 + +; Large Constants +; Statepoint ID only +; CHECK: .quad 2882400000 + +; Callsites +; Constant arguments +; CHECK: .long .Ltmp1-test +; CHECK: .short 0 +; CHECK: .short 8 +; SmallConstant (0) +; CHECK: .byte 4 +; CHECK: .byte 8 +; CHECK: .short 0 +; CHECK: .long 0 +; SmallConstant (2) +; CHECK: .byte 4 +; CHECK: .byte 8 +; CHECK: .short 0 +; CHECK: .long 2 +; Direct Spill Slot [RSP+0] +; CHECK: .byte 2 +; CHECK: .byte 8 +; CHECK: .short 7 +; CHECK: .long 0 +; SmallConstant (0) +; CHECK: .byte 4 +; CHECK: .byte 8 +; CHECK: .short 0 +; CHECK: .long 0 +; SmallConstant (0) +; CHECK: .byte 4 +; CHECK: .byte 8 +; CHECK: .short 0 +; CHECK: .long 0 +; SmallConstant (0) +; CHECK: .byte 4 +; CHECK: .byte 8 +; CHECK: .short 0 +; CHECK: .long 0 +; Direct Spill Slot [RSP+0] +; CHECK: .byte 2 +; CHECK: .byte 8 +; CHECK: .short 7 +; CHECK: .long 0 +; Direct Spill Slot [RSP+0] +; CHECK: .byte 2 +; CHECK: .byte 8 +; CHECK: .short 7 +; CHECK: .long 0 + +; No Padding or LiveOuts +; CHECK: .short 0 +; CHECK: .short 0 +; CHECK: .align 8 + + diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll index 7557f255658d..e3cc2fa668ef 100644 --- a/test/CodeGen/X86/store-narrow.ll +++ b/test/CodeGen/X86/store-narrow.ll @@ -34,8 +34,8 @@ entry: ; X64: movb %sil, 1(%rdi) ; X32-LABEL: test2: -; X32: movb 8(%esp), %[[REG:[abcd]l]] -; X32: movb %[[REG]], 1(%{{.*}}) +; X32: movb 8(%esp), %[[REG:[abcd]]]l +; X32: movb %[[REG]]l, 1(%{{.*}}) } define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp { @@ -67,8 +67,8 @@ entry: ; X64: movw %si, 2(%rdi) ; X32-LABEL: test4: -; X32: movl 8(%esp), %e[[REG:[abcd]x]] -; X32: movw %[[REG]], 2(%{{.*}}) +; X32: movw 8(%esp), %[[REG:[abcd]]]x +; X32: movw %[[REG]]x, 2(%{{.*}}) } define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp { @@ -84,8 +84,8 @@ entry: ; X64: movw %si, 2(%rdi) ; X32-LABEL: test5: -; X32: movzwl 8(%esp), %e[[REG:[abcd]x]] -; X32: movw %[[REG]], 2(%{{.*}}) +; X32: movw 8(%esp), %[[REG:[abcd]]]x +; X32: movw %[[REG]]x, 2(%{{.*}}) } define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp { diff --git 
a/test/CodeGen/X86/switch-default-only.ll b/test/CodeGen/X86/switch-default-only.ll new file mode 100644 index 000000000000..360ace5b787f --- /dev/null +++ b/test/CodeGen/X86/switch-default-only.ll @@ -0,0 +1,14 @@ +; RUN: llc -O0 -fast-isel=false -march=x86 < %s | FileCheck %s + +; No need for branching when the default and only destination follows +; immediately after the switch. +; CHECK-LABEL: no_branch: +; CHECK-NOT: jmp +; CHECK: ret + +define void @no_branch(i32 %x) { +entry: + switch i32 %x, label %exit [ ] +exit: + ret void +} diff --git a/test/CodeGen/X86/switch-jump-table.ll b/test/CodeGen/X86/switch-jump-table.ll new file mode 100644 index 000000000000..a84fb4aafd17 --- /dev/null +++ b/test/CodeGen/X86/switch-jump-table.ll @@ -0,0 +1,52 @@ +; RUN: llc -mtriple=i686-pc-gnu-linux < %s | FileCheck %s + + +; An unreachable default destination is replaced with the most popular case label. + +define void @sum2(i32 %x, i32* %to) { +; CHECK-LABEL: sum2: +; CHECK: movl 4(%esp), [[REG:%e[a-z]{2}]] +; CHECK: cmpl $3, [[REG]] +; CHECK: jbe .LBB0_1 +; CHECK: movl $4 +; CHECK: retl +; CHECK-LABEL: .LBB0_1: +; CHECK-NEXT: jmpl *.LJTI0_0(,[[REG]],4) + +entry: + switch i32 %x, label %default [ + i32 0, label %bb0 + i32 1, label %bb1 + i32 2, label %bb2 + i32 3, label %bb3 + i32 4, label %bb4 + i32 5, label %bb4 + ] +bb0: + store i32 0, i32* %to + br label %exit +bb1: + store i32 1, i32* %to + br label %exit +bb2: + store i32 2, i32* %to + br label %exit +bb3: + store i32 3, i32* %to + br label %exit +bb4: + store i32 4, i32* %to + br label %exit +exit: + ret void +default: + unreachable + +; The jump table has four entries. +; CHECK-LABEL: .LJTI0_0: +; CHECK-NEXT: .long .LBB0_2 +; CHECK-NEXT: .long .LBB0_3 +; CHECK-NEXT: .long .LBB0_4 +; CHECK-NEXT: .long .LBB0_5 +; CHECK-NOT: .long +} diff --git a/test/CodeGen/X86/swizzle-2.ll b/test/CodeGen/X86/swizzle-2.ll index 4b1f903c444a..697af843abb1 100644 --- a/test/CodeGen/X86/swizzle-2.ll +++ b/test/CodeGen/X86/swizzle-2.ll @@ -8,508 +8,433 @@ ; illegal shuffle that is expanded into a sub-optimal sequence of instructions ; during lowering stage. 
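;
; Note: each pair of shuffles below composes into a single mask, with
; element i of the result being v[m[m[i]]]. For swizzle_1, applying
; [3,2,0,1] twice gives the combined mask [1,0,3,2], which is the pshufd
; pattern the updated checks expect.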
- define <4 x i32> @swizzle_1(<4 x i32> %v) { +; CHECK-LABEL: swizzle_1: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_1 -; Mask: [1,0,3,2] -; CHECK: pshufd $-79 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_2(<4 x i32> %v) { +; CHECK-LABEL: swizzle_2: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_2 -; Mask: [2,1,3,0] -; CHECK: pshufd $54 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_3(<4 x i32> %v) { +; CHECK-LABEL: swizzle_3: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_3 -; Mask: [1,0,3,2] -; CHECK: pshufd $-79 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_4(<4 x i32> %v) { +; CHECK-LABEL: swizzle_4: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_4 -; Mask: [3,1,0,2] -; CHECK: pshufd $-121 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_5(<4 x i32> %v) { +; CHECK-LABEL: swizzle_5: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_5 -; Mask: [2,3,0,1] -; CHECK: pshufd $78 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_6(<4 x i32> %v) { +; CHECK-LABEL: swizzle_6: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_6 -; Mask: [2,0,1,3] -; CHECK: pshufd $-46 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_7(<4 x i32> %v) { +; CHECK-LABEL: swizzle_7: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,1] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_7 -; Mask: [0,2,3,1] -; CHECK: pshufd $120 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_8(<4 x i32> %v) { +; CHECK-LABEL: swizzle_8: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1> %2 = shufflevector <4 x i32> %1, 
<4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_8 -; Mask: [1,3,2,0] -; CHECK: pshufd $45 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_9(<4 x i32> %v) { +; CHECK-LABEL: swizzle_9: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_9 -; Mask: [2,3,0,1] -; CHECK: pshufd $78 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_10(<4 x i32> %v) { +; CHECK-LABEL: swizzle_10: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,0,3] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_10 -; Mask: [1,2,0,3] -; CHECK: pshufd $-55 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_11(<4 x i32> %v) { +; CHECK-LABEL: swizzle_11: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_11 -; Mask: [3,2,1,0] -; CHECK: pshufd $27 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_12(<4 x i32> %v) { +; CHECK-LABEL: swizzle_12: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_12 -; Mask: [0,3,1,2] -; CHECK: pshufd $-100 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_13(<4 x i32> %v) { +; CHECK-LABEL: swizzle_13: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_13 -; Mask: [3,2,1,0] -; CHECK: pshufd $27 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x i32> @swizzle_14(<4 x i32> %v) { +; CHECK-LABEL: swizzle_14: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,2,1] +; CHECK-NEXT: retq %1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0> ret <4 x i32> %2 } -; CHECK-LABEL: swizzle_14 -; Mask: [3,0,2,1] -; CHECK: pshufd $99 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_15(<4 x float> %v) { +; CHECK-LABEL: swizzle_15: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,3,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_15 -; Mask: [1,0,3,2] -; CHECK: pshufd $-79 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_16(<4 x float> %v) { +; CHECK-LABEL: 
swizzle_16: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_16 -; Mask: [2,1,3,0] -; CHECK: pshufd $54 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_17(<4 x float> %v) { +; CHECK-LABEL: swizzle_17: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,3,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_17 -; Mask: [1,0,3,2] -; CHECK: pshufd $-79 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_18(<4 x float> %v) { +; CHECK-LABEL: swizzle_18: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,0,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_18 -; Mask: [3,1,0,2] -; CHECK: pshufd $-121 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_19(<4 x float> %v) { +; CHECK-LABEL: swizzle_19: +; CHECK: # BB#0: +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_19 -; Mask: [2,3,0,1] -; CHECK: pshufd $78 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_20(<4 x float> %v) { +; CHECK-LABEL: swizzle_20: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_20 -; Mask: [2,0,1,3] -; CHECK: pshufd $-46 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_21(<4 x float> %v) { +; CHECK-LABEL: swizzle_21: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_21 -; Mask: [0,2,3,1] -; CHECK: pshufd $120 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_22(<4 x float> %v) { +; CHECK-LABEL: swizzle_22: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_22 -; Mask: [1,3,2,0] -; CHECK: pshufd $45 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_23(<4 x float> %v) { +; CHECK-LABEL: swizzle_23: +; CHECK: # BB#0: +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, 
i32 0, i32 1, i32 2> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_23 -; Mask: [2,3,0,1] -; CHECK: pshufd $78 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_24(<4 x float> %v) { +; CHECK-LABEL: swizzle_24: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2,0,3] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_24 -; Mask: [1,2,0,3] -; CHECK: pshufd $-55 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_25(<4 x float> %v) { +; CHECK-LABEL: swizzle_25: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_25 -; Mask: [3,2,1,0] -; CHECK: pshufd $27 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_26(<4 x float> %v) { +; CHECK-LABEL: swizzle_26: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,1,2] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_26 -; Mask: [0,3,1,2] -; CHECK: pshufd $-100 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_27(<4 x float> %v) { +; CHECK-LABEL: swizzle_27: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_27 -; Mask: [3,2,1,0] -; CHECK: pshufd $27 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_28(<4 x float> %v) { +; CHECK-LABEL: swizzle_28: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,2,1] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_28 -; Mask: [3,0,2,1] -; CHECK: pshufd $99 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret - define <4 x float> @swizzle_29(<4 x float> %v) { +; CHECK-LABEL: swizzle_29: +; CHECK: # BB#0: +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,0] +; CHECK-NEXT: retq %1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0> %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3> ret <4 x float> %2 } -; CHECK-LABEL: swizzle_29 -; Mask: [1,3,2,0] -; CHECK: pshufd $45 -; CHECK-NOT: pshufd -; CHECK-NEXT: ret ; Make sure that we combine the shuffles from each function below into a single ; legal shuffle (either pshuflw or pshufb depending on the masks). 
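;
; For instance, in swizzle_30 the two masks compose to [1,3,2,0,4,5,6,7]:
; the high four lanes come out as the identity, so a single pshuflw on the
; low half covers the whole combined shuffle.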
define <8 x i16> @swizzle_30(<8 x i16> %v) { +; CHECK-LABEL: swizzle_30: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 7, i32 5, i32 6, i32 4> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_30 -; Mask: [1,3,2,0,5,7,6,4] -; CHECK: pshuflw $45 -; CHECK-NOT: pshufb -; CHECK-NEXT: ret - define <8 x i16> @swizzle_31(<8 x i16> %v) { +; CHECK-LABEL: swizzle_31: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 5, i32 6, i32 4> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 5, i32 6, i32 4> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_31 -; Mask: [1,3,2,0,4,5,6,7] -; CHECK: pshuflw $45 -; CHECK-NOT: pshufb -; CHECK: ret - define <8 x i16> @swizzle_32(<8 x i16> %v) { +; CHECK-LABEL: swizzle_32: +; CHECK: # BB#0: +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 7, i32 5, i32 6, i32 4> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 7, i32 5, i32 6, i32 4> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_32 -; Mask: [2,3,0,1,4,5,6,7] --> equivalent to pshufd mask [1,0,2,3] -; CHECK: pshufd $-31 -; CHECK-NOT: pshufb -; CHECK: ret define <8 x i16> @swizzle_33(<8 x i16> %v) { +; CHECK-LABEL: swizzle_33: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 5, i32 7, i32 2, i32 3, i32 1, i32 0> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 5, i32 7, i32 2, i32 3, i32 1, i32 0> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_33 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> @swizzle_34(<8 x i16> %v) { +; CHECK-LABEL: swizzle_34: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,0,2,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 7, i32 6, i32 5, i32 1, i32 2, i32 0, i32 3> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 7, i32 6, i32 5, i32 1, i32 2, i32 0, i32 3> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_34 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> @swizzle_35(<8 x i16> %v) { +; CHECK-LABEL: swizzle_35: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 1, i32 3, i32 0, i32 2> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 1, i32 3, i32 0, i32 2> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_35 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK: ret - define <8 x i16> @swizzle_36(<8 x i16> %v) { +; CHECK-LABEL: swizzle_36: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] +; 
CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 7, i32 5, i32 0, i32 1, i32 3, i32 2> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 6, i32 7, i32 5, i32 0, i32 1, i32 3, i32 2> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_36 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> @swizzle_37(<8 x i16> %v) { +; CHECK-LABEL: swizzle_37: +; CHECK: # BB#0: +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 7, i32 5, i32 6, i32 4> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 7, i32 4, i32 6, i32 5> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_37 -; Mask: [0,1,2,3,4,7,6,5] -; CHECK: pshufhw $108 -; CHECK-NOT: pshufb -; CHECK: ret - define <8 x i16> @swizzle_38(<8 x i16> %v) { +; CHECK-LABEL: swizzle_38: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 5, i32 6, i32 4, i32 7, i32 0, i32 2, i32 1, i32 3> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 5, i32 6, i32 4, i32 7, i32 0, i32 2, i32 1, i32 3> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_38 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> @swizzle_39(<8 x i16> %v) { +; CHECK-LABEL: swizzle_39: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,3,1,0,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 3, i32 2, i32 1, i32 0> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 5, i32 4, i32 6, i32 7, i32 3, i32 2, i32 1, i32 0> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_39 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> @swizzle_40(<8 x i16> %v) { +; CHECK-LABEL: swizzle_40: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 6, i32 4, i32 7, i32 5, i32 1, i32 0, i32 3, i32 2> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 6, i32 4, i32 7, i32 5, i32 1, i32 0, i32 3, i32 2> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_40 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> @swizzle_41(<8 x i16> %v) { +; CHECK-LABEL: swizzle_41: +; CHECK: # BB#0: +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 5, i32 4, i32 0, i32 1, i32 3, i32 2> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 6, i32 7, i32 5, i32 4, i32 0, i32 1, i32 3, i32 2> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_41 -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK-NOT: shufpd -; CHECK: ret - define <8 x i16> @swizzle_42(<8 x i16> %v) { +; CHECK-LABEL: swizzle_42: +; CHECK: # BB#0: +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] +; CHECK-NEXT: retq %1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 0, 
i32 1, i32 3, i32 2, i32 7, i32 6, i32 4, i32 5> %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 7, i32 6, i32 4, i32 5> ret <8 x i16> %2 } -; CHECK-LABEL: swizzle_42 -; Mask: [0,1,2,3,5,4,7,6] -; CHECK: pshufhw $-79 -; CHECK-NOT: pshufb -; CHECK: ret - - diff --git a/test/CodeGen/X86/swizzle.ll b/test/CodeGen/X86/swizzle.ll deleted file mode 100644 index 23e0c2453d64..000000000000 --- a/test/CodeGen/X86/swizzle.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movlps -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movsd -; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep movups -; rdar://6523650 - - %struct.vector4_t = type { <4 x float> } - -define void @swizzle(i8* nocapture %a, %struct.vector4_t* nocapture %b, %struct.vector4_t* nocapture %c) nounwind { -entry: - %0 = getelementptr %struct.vector4_t* %b, i32 0, i32 0 ; <<4 x float>*> [#uses=2] - %1 = load <4 x float>* %0, align 4 ; <<4 x float>> [#uses=1] - %tmp.i = bitcast i8* %a to double* ; <double*> [#uses=1] - %tmp1.i = load double* %tmp.i ; <double> [#uses=1] - %2 = insertelement <2 x double> undef, double %tmp1.i, i32 0 ; <<2 x double>> [#uses=1] - %tmp2.i = bitcast <2 x double> %2 to <4 x float> ; <<4 x float>> [#uses=1] - %3 = shufflevector <4 x float> %1, <4 x float> %tmp2.i, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x float>> [#uses=1] - store <4 x float> %3, <4 x float>* %0, align 4 - ret void -} diff --git a/test/CodeGen/X86/tailcall-multiret.ll b/test/CodeGen/X86/tailcall-multiret.ll new file mode 100644 index 000000000000..a77a59cd70bc --- /dev/null +++ b/test/CodeGen/X86/tailcall-multiret.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mcpu=core2 | FileCheck %s +; See PR19530 +declare double @llvm.powi.f64(double %Val, i32 %power) +define <3 x double> @julia_foo17589(i32 %arg) { + %tmp1 = call double @llvm.powi.f64(double 1.000000e+00, i32 %arg) +; CHECK: callq __powidf2 + %tmp2 = insertelement <3 x double> undef, double %tmp1, i32 0 + %tmp3 = call double @llvm.powi.f64(double 2.000000e+00, i32 %arg) +; CHECK: callq __powidf2 + %tmp4 = insertelement <3 x double> %tmp2, double %tmp3, i32 1 + %tmp5 = call double @llvm.powi.f64(double 3.000000e+00, i32 %arg) +; CHECK: callq __powidf2 + %tmp6 = insertelement <3 x double> %tmp4, double %tmp5, i32 2 +; CHECK-NOT: TAILCALL + ret <3 x double> %tmp6 +} diff --git a/test/CodeGen/X86/tailcall-returndup-void.ll b/test/CodeGen/X86/tailcall-returndup-void.ll index c1d631225ec7..2c39cb4468df 100644 --- a/test/CodeGen/X86/tailcall-returndup-void.ll +++ b/test/CodeGen/X86/tailcall-returndup-void.ll @@ -3,9 +3,9 @@ ; CHECK-NOT: ret @sES_closure = external global [0 x i64] -declare cc10 void @sEH_info(i64* noalias nocapture, i64* noalias nocapture, i64* noalias nocapture, i64, i64, i64) align 8 +declare ghccc void @sEH_info(i64* noalias nocapture, i64* noalias nocapture, i64* noalias nocapture, i64, i64, i64) align 8 -define cc10 void @rBM_info(i64* noalias nocapture %Base_Arg, i64* noalias nocapture %Sp_Arg, i64* noalias nocapture %Hp_Arg, i64 %R1_Arg, i64 %R2_Arg, i64 %R3_Arg) nounwind align 8 { +define ghccc void @rBM_info(i64* noalias nocapture %Base_Arg, i64* noalias nocapture %Sp_Arg, i64* noalias nocapture %Hp_Arg, i64 %R1_Arg, i64 %R2_Arg, i64 %R3_Arg) nounwind align 8 { c263: %ln265 = getelementptr inbounds i64* %Sp_Arg, i64 -2 %ln266 = ptrtoint i64* %ln265 to i64 @@ -18,11 +18,11 @@ n26p: ; preds = %c263 n1ZQ.i: ; preds = %n26p %ln1ZT.i = load i64* getelementptr inbounds ([0 x i64]* 
@sES_closure, i64 0, i64 0), align 8 %ln1ZU.i = inttoptr i64 %ln1ZT.i to void (i64*, i64*, i64*, i64, i64, i64)* - tail call cc10 void %ln1ZU.i(i64* %Base_Arg, i64* %Sp_Arg, i64* %Hp_Arg, i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 %R3_Arg) nounwind + tail call ghccc void %ln1ZU.i(i64* %Base_Arg, i64* %Sp_Arg, i64* %Hp_Arg, i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 %R3_Arg) nounwind br label %rBL_info.exit c1ZP.i: ; preds = %n26p - tail call cc10 void @sEH_info(i64* %Base_Arg, i64* %Sp_Arg, i64* %Hp_Arg, i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 %R3_Arg) nounwind + tail call ghccc void @sEH_info(i64* %Base_Arg, i64* %Sp_Arg, i64* %Hp_Arg, i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 ptrtoint ([0 x i64]* @sES_closure to i64), i64 %R3_Arg) nounwind br label %rBL_info.exit rBL_info.exit: ; preds = %c1ZP.i, %n1ZQ.i @@ -32,6 +32,6 @@ c26a: ; preds = %c263 %ln27h = getelementptr inbounds i64* %Base_Arg, i64 -2 %ln27j = load i64* %ln27h, align 8 %ln27k = inttoptr i64 %ln27j to void (i64*, i64*, i64*, i64, i64, i64)* - tail call cc10 void %ln27k(i64* %Base_Arg, i64* %Sp_Arg, i64* %Hp_Arg, i64 %R1_Arg, i64 %R2_Arg, i64 %R3_Arg) nounwind + tail call ghccc void %ln27k(i64* %Base_Arg, i64* %Sp_Arg, i64* %Hp_Arg, i64 %R1_Arg, i64 %R2_Arg, i64 %R3_Arg) nounwind ret void } diff --git a/test/CodeGen/X86/tls-addr-non-leaf-function.ll b/test/CodeGen/X86/tls-addr-non-leaf-function.ll new file mode 100644 index 000000000000..ec47232059f8 --- /dev/null +++ b/test/CodeGen/X86/tls-addr-non-leaf-function.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -relocation-model=pic -O2 -disable-fp-elim -o - | FileCheck %s +; RUN: llc < %s -relocation-model=pic -O2 -o - | FileCheck %s + +; This test runs twice with different options regarding the frame pointer: +; first the elimination is disabled, then it is enabled. The disabled case is +; the "control group". +; The function 'foo' below is marked with the "no-frame-pointer-elim-non-leaf" +; attribute which dictates that the frame pointer should not be eliminated +; unless the function is a leaf (i.e. it doesn't call any other function). +; Now, 'foo' is not a leaf function, because it performs a TLS access which on +; X86 ELF in PIC mode is expanded as a library call. +; This call is represented with a pseudo-instruction which doesn't appear to be +; a call when inspected by the analysis passes (it doesn't have the "isCall" +; flag), and the ISel lowering code creating the pseudo was not informing the +; MachineFrameInfo that the function contained calls. This affected the decision +; whether to eliminate the frame pointer. +; With the fix, the "hasCalls" flag is set in the MFI for the function whenever +; a TLS access pseudo-instruction is created, so 'foo' appears to be a non-leaf +; function, and the difference in the options does not affect codegen: both +; versions will have a frame pointer. + +; Test that there's some frame pointer usage in 'foo'... +; CHECK: foo: +; CHECK: pushq %rbp +; CHECK: movq %rsp, %rbp +; ... and the TLS library call is also present. 
+; CHECK: leaq x@TLSGD(%rip), %rdi +; CHECK: callq __tls_get_addr@PLT + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@x = thread_local global i32 0 +define i32 @foo() "no-frame-pointer-elim-non-leaf" { + %a = load i32* @x, align 4 + ret i32 %a +} diff --git a/test/CodeGen/X86/tls-models.ll b/test/CodeGen/X86/tls-models.ll index 8e3e95886ad8..0fd785328211 100644 --- a/test/CodeGen/X86/tls-models.ll +++ b/test/CodeGen/X86/tls-models.ll @@ -128,6 +128,14 @@ entry: ; DARWIN: _internal_ie@TLVP } +define i32 @PR22083() { +entry: + ret i32 ptrtoint (i32* @external_ie to i32) + ; X64-LABEL: PR22083: + ; X64: movq external_ie@GOTTPOFF(%rip), %rax + ; X64_PIC-LABEL: PR22083: + ; X64_PIC: movq external_ie@GOTTPOFF(%rip), %rax +} ; ----- localexec specified ----- diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll index d230f1f7e2c6..8de6297906c7 100644 --- a/test/CodeGen/X86/trunc-ext-ld-st.ll +++ b/test/CodeGen/X86/trunc-ext-ld-st.ll @@ -20,7 +20,7 @@ define void @load_2_i8(<2 x i8>* %A) { ; Read 32-bits ;CHECK: pmovzxwq ;CHECK: paddq -;CHECK: pshufb +;CHECK: pshufd ;CHECK: movd ;CHECK: ret define void @load_2_i16(<2 x i16>* %A) { @@ -32,7 +32,7 @@ define void @load_2_i16(<2 x i16>* %A) { ;CHECK-LABEL: load_2_i32: ;CHECK: pmovzxdq -;CHECK: paddq +;CHECK: paddd ;CHECK: pshufd ;CHECK: ret define void @load_2_i32(<2 x i32>* %A) { @@ -56,7 +56,7 @@ define void @load_4_i8(<4 x i8>* %A) { ;CHECK-LABEL: load_4_i16: ;CHECK: pmovzxwd -;CHECK: paddd +;CHECK: paddw ;CHECK: pshufb ;CHECK: ret define void @load_4_i16(<4 x i16>* %A) { @@ -68,7 +68,7 @@ define void @load_4_i16(<4 x i16>* %A) { ;CHECK-LABEL: load_8_i8: ;CHECK: pmovzxbw -;CHECK: paddw +;CHECK: paddb ;CHECK: pshufb ;CHECK: ret define void @load_8_i8(<8 x i8>* %A) { diff --git a/test/CodeGen/X86/uint_to_fp-2.ll b/test/CodeGen/X86/uint_to_fp-2.ll index c5a61c3779bf..e47f15453ed4 100644 --- a/test/CodeGen/X86/uint_to_fp-2.ll +++ b/test/CodeGen/X86/uint_to_fp-2.ll @@ -1,15 +1,20 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown-unknown -march=x86 -mattr=+sse2 | FileCheck %s ; rdar://6504833 define float @test1(i32 %x) nounwind readnone { -; CHECK: test1 -; CHECK: movd -; CHECK: orps -; CHECK: subsd -; CHECK: cvtsd2ss -; CHECK: movss -; CHECK: flds -; CHECK: ret +; CHECK-LABEL: test1: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movsd .LCPI0_0, %xmm0 +; CHECK-NEXT: movd {{[0-9]+}}(%esp), %xmm1 +; CHECK-NEXT: orps %xmm0, %xmm1 +; CHECK-NEXT: subsd %xmm0, %xmm1 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsd2ss %xmm1, %xmm0 +; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl entry: %0 = uitofp i32 %x to float ret float %0 @@ -17,15 +22,20 @@ entry: ; PR10802 define float @test2(<4 x i32> %x) nounwind readnone ssp { -; CHECK: test2 -; CHECK: xorps [[ZERO:%xmm[0-9]+]] -; CHECK: movss {{.*}}, [[ZERO]] -; CHECK: orps -; CHECK: subsd -; CHECK: cvtsd2ss -; CHECK: movss -; CHECK: flds -; CHECK: ret +; CHECK-LABEL: test2: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movss %xmm0, %xmm1 +; CHECK-NEXT: movsd .LCPI1_0, %xmm0 +; CHECK-NEXT: orps %xmm0, %xmm1 +; CHECK-NEXT: subsd %xmm0, %xmm1 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: cvtsd2ss %xmm1, %xmm0 +; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl entry: %vecext = 
extractelement <4 x i32> %x, i32 0 %conv = uitofp i32 %vecext to float diff --git a/test/CodeGen/X86/unaligned-32-byte-memops.ll b/test/CodeGen/X86/unaligned-32-byte-memops.ll new file mode 100644 index 000000000000..347f330d67ae --- /dev/null +++ b/test/CodeGen/X86/unaligned-32-byte-memops.ll @@ -0,0 +1,279 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=SANDYB --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2 --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=HASWELL --check-prefix=CHECK + +; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load +; because that is slower than two 16-byte loads. +; Other AVX-capable chips don't have that problem. + +define <8 x float> @load32bytes(<8 x float>* %Ap) { + ; CHECK-LABEL: load32bytes + + ; SANDYB: vmovaps + ; SANDYB: vinsertf128 + ; SANDYB: retq + + ; BTVER2: vmovups + ; BTVER2: retq + + ; HASWELL: vmovups + ; HASWELL: retq + + %A = load <8 x float>* %Ap, align 16 + ret <8 x float> %A +} + +; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store +; because that is slower than two 16-byte stores. +; Other AVX-capable chips don't have that problem. + +define void @store32bytes(<8 x float> %A, <8 x float>* %P) { + ; CHECK-LABEL: store32bytes + + ; SANDYB: vextractf128 + ; SANDYB: vmovaps + ; SANDYB: retq + + ; BTVER2: vmovups + ; BTVER2: retq + + ; HASWELL: vmovups + ; HASWELL: retq + + store <8 x float> %A, <8 x float>* %P, align 16 + ret void +} + +; Merge two consecutive 16-byte subvector loads into a single 32-byte load +; if it's faster. + +declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) + +; Use the vinsertf128 intrinsic to model source code +; that explicitly uses AVX intrinsics. +define <8 x float> @combine_16_byte_loads(<4 x float>* %ptr) { + ; CHECK-LABEL: combine_16_byte_loads + + ; SANDYB: vmovups + ; SANDYB-NEXT: vinsertf128 + ; SANDYB-NEXT: retq + + ; BTVER2: vmovups + ; BTVER2-NEXT: retq + + ; HASWELL: vmovups + ; HASWELL-NEXT: retq + + %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1 + %v1 = load <4 x float>* %ptr, align 1 + %v2 = load <4 x float>* %ptr2, align 1 + %shuffle = shufflevector <4 x float> %v1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v2, i8 1) + ret <8 x float> %v3 +} + +; Swap the operands of the shufflevector and vinsertf128 to ensure that the +; pattern still matches.
+define <8 x float> @combine_16_byte_loads_swap(<4 x float>* %ptr) { + ; CHECK-LABEL: combine_16_byte_loads_swap + + ; SANDYB: vmovups + ; SANDYB-NEXT: vinsertf128 + ; SANDYB-NEXT: retq + + ; BTVER2: vmovups + ; BTVER2-NEXT: retq + + ; HASWELL: vmovups + ; HASWELL-NEXT: retq + + %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1 + %v1 = load <4 x float>* %ptr, align 1 + %v2 = load <4 x float>* %ptr2, align 1 + %shuffle = shufflevector <4 x float> %v2, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3> + %v3 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %shuffle, <4 x float> %v1, i8 0) + ret <8 x float> %v3 +} + +; Replace the vinsertf128 intrinsic with a shufflevector as might be +; expected from auto-vectorized code. +define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) { + ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic + + ; SANDYB: vmovups + ; SANDYB-NEXT: vinsertf128 + ; SANDYB-NEXT: retq + + ; BTVER2: vmovups + ; BTVER2-NEXT: retq + + ; HASWELL: vmovups + ; HASWELL-NEXT: retq + + %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1 + %v1 = load <4 x float>* %ptr, align 1 + %v2 = load <4 x float>* %ptr2, align 1 + %v3 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x float> %v3 +} + +; Swap the order of the shufflevector operands to ensure that the +; pattern still matches. +define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) { + ; CHECK-LABEL: combine_16_byte_loads_no_intrinsic_swap + + ; SANDYB: vmovups + ; SANDYB-NEXT: vinsertf128 + ; SANDYB-NEXT: retq + + ; BTVER2: vmovups + ; BTVER2-NEXT: retq + + ; HASWELL: vmovups + ; HASWELL-NEXT: retq + + %ptr2 = getelementptr inbounds <4 x float>* %ptr, i64 1 + %v1 = load <4 x float>* %ptr, align 1 + %v2 = load <4 x float>* %ptr2, align 1 + %v3 = shufflevector <4 x float> %v2, <4 x float> %v1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> + ret <8 x float> %v3 +} + +; Check each element type other than float to make sure it is handled correctly. +; Use the loaded values with an 'add' to make sure we're using the correct load type. +; Even though BtVer2 has fast 32-byte loads, we should not generate those for +; 256-bit integer vectors because BtVer2 doesn't have AVX2. 
+ +define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) { + ; CHECK-LABEL: combine_16_byte_loads_i64 + + ; SANDYB: vextractf128 + ; SANDYB-NEXT: vpaddq + ; SANDYB-NEXT: vpaddq + ; SANDYB-NEXT: vinsertf128 + ; SANDYB-NEXT: retq + + ; BTVER2: vextractf128 + ; BTVER2-NEXT: vpaddq + ; BTVER2-NEXT: vpaddq + ; BTVER2-NEXT: vinsertf128 + ; BTVER2-NEXT: retq + + ; HASWELL: vmovdqu + ; HASWELL-NEXT: vpaddq + ; HASWELL-NEXT: retq + + %ptr2 = getelementptr inbounds <2 x i64>* %ptr, i64 1 + %v1 = load <2 x i64>* %ptr, align 1 + %v2 = load <2 x i64>* %ptr2, align 1 + %v3 = shufflevector <2 x i64> %v1, <2 x i64> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %v4 = add <4 x i64> %v3, %x + ret <4 x i64> %v4 +} + +define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) { + ; CHECK-LABEL: combine_16_byte_loads_i32 + + ; SANDYB: vextractf128 + ; SANDYB-NEXT: vpaddd + ; SANDYB-NEXT: vpaddd + ; SANDYB-NEXT: vinsertf128 + ; SANDYB-NEXT: retq + + ; BTVER2: vextractf128 + ; BTVER2-NEXT: vpaddd + ; BTVER2-NEXT: vpaddd + ; BTVER2-NEXT: vinsertf128 + ; BTVER2-NEXT: retq + + ; HASWELL: vmovdqu + ; HASWELL-NEXT: vpaddd + ; HASWELL-NEXT: retq + + %ptr2 = getelementptr inbounds <4 x i32>* %ptr, i64 1 + %v1 = load <4 x i32>* %ptr, align 1 + %v2 = load <4 x i32>* %ptr2, align 1 + %v3 = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %v4 = add <8 x i32> %v3, %x + ret <8 x i32> %v4 +} + +define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) { + ; CHECK-LABEL: combine_16_byte_loads_i16 + + ; SANDYB: vextractf128 + ; SANDYB-NEXT: vpaddw + ; SANDYB-NEXT: vpaddw + ; SANDYB-NEXT: vinsertf128 + ; SANDYB-NEXT: retq + + ; BTVER2: vextractf128 + ; BTVER2-NEXT: vpaddw + ; BTVER2-NEXT: vpaddw + ; BTVER2-NEXT: vinsertf128 + ; BTVER2-NEXT: retq + + ; HASWELL: vmovdqu + ; HASWELL-NEXT: vpaddw + ; HASWELL-NEXT: retq + + %ptr2 = getelementptr inbounds <8 x i16>* %ptr, i64 1 + %v1 = load <8 x i16>* %ptr, align 1 + %v2 = load <8 x i16>* %ptr2, align 1 + %v3 = shufflevector <8 x i16> %v1, <8 x i16> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %v4 = add <16 x i16> %v3, %x + ret <16 x i16> %v4 +} + +define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) { + ; CHECK-LABEL: combine_16_byte_loads_i8 + + ; SANDYB: vextractf128 + ; SANDYB-NEXT: vpaddb + ; SANDYB-NEXT: vpaddb + ; SANDYB-NEXT: vinsertf128 + ; SANDYB-NEXT: retq + + ; BTVER2: vextractf128 + ; BTVER2-NEXT: vpaddb + ; BTVER2-NEXT: vpaddb + ; BTVER2-NEXT: vinsertf128 + ; BTVER2-NEXT: retq + + ; HASWELL: vmovdqu + ; HASWELL-NEXT: vpaddb + ; HASWELL-NEXT: retq + + %ptr2 = getelementptr inbounds <16 x i8>* %ptr, i64 1 + %v1 = load <16 x i8>* %ptr, align 1 + %v2 = load <16 x i8>* %ptr2, align 1 + %v3 = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + %v4 = add <32 x i8> %v3, %x + ret <32 x i8> %v4 +} + +define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) { + ; CHECK-LABEL: combine_16_byte_loads_double + + ; SANDYB: vmovupd + ; SANDYB-NEXT: vinsertf128 + ; SANDYB-NEXT: vaddpd + ; SANDYB-NEXT: retq + + ; BTVER2: vmovupd + ; BTVER2-NEXT: vaddpd + ; BTVER2-NEXT: retq + + ; 
HASWELL: vmovupd + ; HASWELL: vaddpd + ; HASWELL-NEXT: retq + + %ptr2 = getelementptr inbounds <2 x double>* %ptr, i64 1 + %v1 = load <2 x double>* %ptr, align 1 + %v2 = load <2 x double>* %ptr2, align 1 + %v3 = shufflevector <2 x double> %v1, <2 x double> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %v4 = fadd <4 x double> %v3, %x + ret <4 x double> %v4 +} + diff --git a/test/CodeGen/X86/unknown-location.ll b/test/CodeGen/X86/unknown-location.ll index d7ae46939035..140121ba3035 100644 --- a/test/CodeGen/X86/unknown-location.ll +++ b/test/CodeGen/X86/unknown-location.ll @@ -21,16 +21,16 @@ entry: !llvm.dbg.cu = !{!3} !llvm.module.flags = !{!12} -!0 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !2, i32 1, metadata !6} ; [ DW_TAG_arg_variable ] -!1 = metadata !{i32 786478, metadata !10, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 1, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, i32, i32, i32)* @foo, null, null, null, i32 1} ; [ DW_TAG_subprogram ] -!2 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ] -!3 = metadata !{i32 786449, metadata !10, i32 12, metadata !"producer", i1 false, metadata !"", i32 0, metadata !11, metadata !11, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ] -!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] -!5 = metadata !{metadata !6} -!6 = metadata !{i32 786468, metadata !10, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] -!7 = metadata !{i32 786443, metadata !2, metadata !1, i32 1, i32 30, i32 0} ; [ DW_TAG_lexical_block ] -!8 = metadata !{i32 4, i32 3, metadata !7, null} -!9 = metadata !{metadata !1} -!10 = metadata !{metadata !"test.c", metadata !"/dir"} -!11 = metadata !{i32 0} -!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1} +!0 = !{!"0x101\00x\001\000", !1, !2, !6} ; [ DW_TAG_arg_variable ] +!1 = !{!"0x2e\00foo\00foo\00foo\001\000\001\000\006\000\000\001", !10, !2, !4, null, i32 (i32, i32, i32, i32)* @foo, null, null, null} ; [ DW_TAG_subprogram ] +!2 = !{!"0x29", !10} ; [ DW_TAG_file_type ] +!3 = !{!"0x11\0012\00producer\000\00\000\00\000", !10, !11, !11, !9, null, null} ; [ DW_TAG_compile_unit ] +!4 = !{!"0x15\00\000\000\000\000\000\000", !10, !2, null, !5, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!5 = !{!6} +!6 = !{!"0x24\00int\000\0032\0032\000\000\005", !10, !2} ; [ DW_TAG_base_type ] +!7 = !{!"0xb\001\0030\000", !2, !1} ; [ DW_TAG_lexical_block ] +!8 = !MDLocation(line: 4, column: 3, scope: !7) +!9 = !{!1} +!10 = !{!"test.c", !"/dir"} +!11 = !{i32 0} +!12 = !{i32 1, !"Debug Info Version", i32 2} diff --git a/test/CodeGen/X86/utf16-cfstrings.ll b/test/CodeGen/X86/utf16-cfstrings.ll index af76a333e8a6..c7ec3eb7abce 100644 --- a/test/CodeGen/X86/utf16-cfstrings.ll +++ b/test/CodeGen/X86/utf16-cfstrings.ll @@ -29,7 +29,7 @@ declare void @NSLog(%0*, ...) 
!llvm.module.flags = !{!0, !1, !2, !3} -!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2} -!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0} -!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"} -!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0} +!0 = !{i32 1, !"Objective-C Version", i32 2} +!1 = !{i32 1, !"Objective-C Image Info Version", i32 0} +!2 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"} +!3 = !{i32 4, !"Objective-C Garbage Collection", i32 0} diff --git a/test/CodeGen/X86/v-binop-widen.ll b/test/CodeGen/X86/v-binop-widen.ll deleted file mode 100644 index fca4da66a85e..000000000000 --- a/test/CodeGen/X86/v-binop-widen.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc -mcpu=generic -march=x86 -mattr=+sse < %s | FileCheck %s -; CHECK: divps -; CHECK: divps -; CHECK: divss - -%vec = type <9 x float> -define %vec @vecdiv( %vec %p1, %vec %p2) -{ - %result = fdiv %vec %p1, %p2 - ret %vec %result -} diff --git a/test/CodeGen/X86/v-binop-widen2.ll b/test/CodeGen/X86/v-binop-widen2.ll deleted file mode 100644 index 334211132f14..000000000000 --- a/test/CodeGen/X86/v-binop-widen2.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc -march=x86 -mcpu=generic -mattr=+sse < %s | FileCheck %s -; RUN: llc -march=x86 -mcpu=atom -mattr=+sse < %s | FileCheck -check-prefix=ATOM %s - -%vec = type <6 x float> -; CHECK: divps -; CHECK: divss -; CHECK: divss - -; Scheduler causes a different instruction order to be produced on Intel Atom -; ATOM: divps -; ATOM: divss -; ATOM: divss - -define %vec @vecdiv( %vec %p1, %vec %p2) -{ - %result = fdiv %vec %p1, %p2 - ret %vec %result -} - -@a = constant %vec < float 2.0, float 4.0, float 8.0, float 16.0, float 32.0, float 64.0 > -@b = constant %vec < float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0 > - -; Expected result: < 1.0, 2.0, 4.0, ..., 2.0^(n-1) > -; main() returns 0 if the result is expected and 1 otherwise -; to execute, use llvm-as < %s | lli -define i32 @main() nounwind { -entry: - %avec = load %vec* @a - %bvec = load %vec* @b - - %res = call %vec @vecdiv(%vec %avec, %vec %bvec) - br label %loop -loop: - %idx = phi i32 [0, %entry], [%nextInd, %looptail] - %expected = phi float [1.0, %entry], [%nextExpected, %looptail] - %elem = extractelement %vec %res, i32 %idx - %expcmp = fcmp oeq float %elem, %expected - br i1 %expcmp, label %looptail, label %return -looptail: - %nextExpected = fmul float %expected, 2.0 - %nextInd = add i32 %idx, 1 - %cmp = icmp slt i32 %nextInd, 6 - br i1 %cmp, label %loop, label %return -return: - %retval = phi i32 [0, %looptail], [1, %loop] - ret i32 %retval -} diff --git a/test/CodeGen/X86/v2f32.ll b/test/CodeGen/X86/v2f32.ll index dab5e7bc944c..b9bd80f949ec 100644 --- a/test/CodeGen/X86/v2f32.ll +++ b/test/CodeGen/X86/v2f32.ll @@ -1,115 +1,94 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -asm-verbose=0 -o - | FileCheck %s -check-prefix=X64 -; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -asm-verbose=0 -o - | FileCheck %s -check-prefix=W64 -; RUN: llc < %s -mcpu=yonah -march=x86 -mtriple=i386-linux-gnu -asm-verbose=0 -o - | FileCheck %s -check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -o - | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mcpu=yonah -march=x86 -mtriple=i386-linux-gnu -o - | FileCheck %s --check-prefix=X32 ; PR7518 define void @test1(<2 x float> %Q, float *%P2) nounwind { +; 
X64-LABEL: test1: +; X64: # BB#0: +; X64-NEXT: movaps %xmm0, %xmm1 +; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X64-NEXT: addss %xmm0, %xmm1 +; X64-NEXT: movss %xmm1, (%rdi) +; X64-NEXT: retq +; +; X32-LABEL: test1: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movaps %xmm0, %xmm1 +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X32-NEXT: addss %xmm0, %xmm1 +; X32-NEXT: movss %xmm1, (%eax) +; X32-NEXT: retl %a = extractelement <2 x float> %Q, i32 0 %b = extractelement <2 x float> %Q, i32 1 %c = fadd float %a, %b - store float %c, float* %P2 ret void -; X64-LABEL: test1: -; X64-NEXT: pshufd $1, %xmm0, %xmm1 -; X64-NEXT: addss %xmm0, %xmm1 -; X64-NEXT: movss %xmm1, (%rdi) -; X64-NEXT: ret - -; W64-LABEL: test1: -; W64-NEXT: movdqa (%rcx), %xmm0 -; W64-NEXT: pshufd $1, %xmm0, %xmm1 -; W64-NEXT: addss %xmm0, %xmm1 -; W64-NEXT: movss %xmm1, (%rdx) -; W64-NEXT: ret - -; X32-LABEL: test1: -; X32-NEXT: movl 4(%esp), %eax -; X32-NEXT: pshufd $1, %xmm0, %xmm1 -; X32-NEXT: addss %xmm0, %xmm1 -; X32-NEXT: movss %xmm1, (%eax) -; X32-NEXT: ret } - define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, <2 x float> *%P) nounwind { - %Z = fadd <2 x float> %Q, %R - ret <2 x float> %Z - ; X64-LABEL: test2: -; X64-NEXT: addps %xmm1, %xmm0 -; X64-NEXT: ret - -; W64-LABEL: test2: -; W64-NEXT: movaps (%rcx), %xmm0 -; W64-NEXT: addps (%rdx), %xmm0 -; W64-NEXT: ret - +; X64: # BB#0: +; X64-NEXT: addps %xmm1, %xmm0 +; X64-NEXT: retq +; ; X32-LABEL: test2: -; X32: addps %xmm1, %xmm0 +; X32: # BB#0: +; X32-NEXT: addps %xmm1, %xmm0 +; X32-NEXT: retl + %Z = fadd <2 x float> %Q, %R + ret <2 x float> %Z } - define <2 x float> @test3(<4 x float> %A) nounwind { +; X64-LABEL: test3: +; X64: # BB#0: +; X64-NEXT: addps %xmm0, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test3: +; X32: # BB#0: +; X32-NEXT: addps %xmm0, %xmm0 +; X32-NEXT: retl %B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1> %C = fadd <2 x float> %B, %B ret <2 x float> %C -; X64-LABEL: test3: -; X64-NEXT: addps %xmm0, %xmm0 -; X64-NEXT: ret - -; W64-LABEL: test3: -; W64-NEXT: movaps (%rcx), %xmm0 -; W64-NEXT: addps %xmm0, %xmm0 -; W64-NEXT: ret - -; X32-LABEL: test3: -; X32-NEXT: addps %xmm0, %xmm0 -; X32-NEXT: ret } define <2 x float> @test4(<2 x float> %A) nounwind { - %C = fadd <2 x float> %A, %A - ret <2 x float> %C ; X64-LABEL: test4: -; X64-NEXT: addps %xmm0, %xmm0 -; X64-NEXT: ret - -; W64-LABEL: test4: -; W64-NEXT: movaps (%rcx), %xmm0 -; W64-NEXT: addps %xmm0, %xmm0 -; W64-NEXT: ret - +; X64: # BB#0: +; X64-NEXT: addps %xmm0, %xmm0 +; X64-NEXT: retq +; ; X32-LABEL: test4: -; X32-NEXT: addps %xmm0, %xmm0 -; X32-NEXT: ret +; X32: # BB#0: +; X32-NEXT: addps %xmm0, %xmm0 +; X32-NEXT: retl + %C = fadd <2 x float> %A, %A + ret <2 x float> %C } define <4 x float> @test5(<4 x float> %A) nounwind { +; X64-LABEL: test5: +; X64: # BB#0: +; X64-NEXT: addps %xmm0, %xmm0 +; X64-NEXT: addps %xmm0, %xmm0 +; X64-NEXT: retq +; +; X32-LABEL: test5: +; X32: # BB#0: +; X32-NEXT: addps %xmm0, %xmm0 +; X32-NEXT: addps %xmm0, %xmm0 +; X32-NEXT: retl %B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1> %C = fadd <2 x float> %B, %B - br label %BB - + br label %BB + BB: - %D = fadd <2 x float> %C, %C + %D = fadd <2 x float> %C, %C %E = shufflevector <2 x float> %D, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ret <4 x float> %E - -; X64-LABEL: test5: -; X64-NEXT: addps %xmm0, %xmm0 -; X64-NEXT: addps %xmm0, %xmm0 -; X64-NEXT: ret - -; W64-LABEL: test5: -; W64-NEXT: 
movaps (%rcx), %xmm0 -; W64-NEXT: addps %xmm0, %xmm0 -; W64-NEXT: addps %xmm0, %xmm0 -; W64-NEXT: ret - -; X32-LABEL: test5: -; X32-NEXT: addps %xmm0, %xmm0 -; X32-NEXT: addps %xmm0, %xmm0 -; X32-NEXT: ret } diff --git a/test/CodeGen/X86/vaargs.ll b/test/CodeGen/X86/vaargs.ll index ddeb7a336d4a..43c895eb39ef 100644 --- a/test/CodeGen/X86/vaargs.ll +++ b/test/CodeGen/X86/vaargs.ll @@ -1,4 +1,4 @@ -; RUN: llc -mcpu=corei7-avx %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=NO-FLAGS +; RUN: llc -verify-machineinstrs -mcpu=corei7-avx %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=NO-FLAGS target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" diff --git a/test/CodeGen/X86/vararg-callee-cleanup.ll b/test/CodeGen/X86/vararg-callee-cleanup.ll new file mode 100644 index 000000000000..2dcf319a2080 --- /dev/null +++ b/test/CodeGen/X86/vararg-callee-cleanup.ll @@ -0,0 +1,54 @@ +; RUN: llc -mtriple=i686-pc-windows < %s | FileCheck %s + +target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32" + +declare x86_thiscallcc void @thiscall_thunk(i8* %this, ...) +define i32 @call_varargs_thiscall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) { + call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2) + call x86_thiscallcc void (i8*, ...)* @thiscall_thunk(i8* %a, i32 1, i32 2) + %t1 = add i32 %b, %c + %r = add i32 %t1, %d + ret i32 %r +} + +; CHECK: _call_varargs_thiscall_thunk: +; CHECK: calll _thiscall_thunk +; CHECK-NEXT: subl $8, %esp + +; We don't mangle the argument size into variadic callee cleanup functions. + +declare x86_stdcallcc void @stdcall_thunk(i8* %this, ...) +define i32 @call_varargs_stdcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) { + call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2) + call x86_stdcallcc void (i8*, ...)* @stdcall_thunk(i8* %a, i32 1, i32 2) + %t1 = add i32 %b, %c + %r = add i32 %t1, %d + ret i32 %r +} + +; CHECK: _call_varargs_stdcall_thunk: +; CHECK: calll _stdcall_thunk{{$}} +; CHECK-NEXT: subl $12, %esp + +declare x86_fastcallcc void @fastcall_thunk(i8* %this, ...) +define i32 @call_varargs_fastcall_thunk(i8* %a, i32 %b, i32 %c, i32 %d) { + call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2) + call x86_fastcallcc void (i8*, ...)* @fastcall_thunk(i8* inreg %a, i32 inreg 1, i32 2) + %t1 = add i32 %b, %c + %r = add i32 %t1, %d + ret i32 %r +} + +; CHECK: _call_varargs_fastcall_thunk: +; CHECK: calll @fastcall_thunk{{$}} +; CHECK-NEXT: subl $4, %esp + +; If you actually return from such a thunk, it will only pop the non-variadic +; portion of the arguments, which is different from what the callee passes. + +define x86_stdcallcc void @varargs_stdcall_return(i32, i32, ...) { + ret void +} + +; CHECK: _varargs_stdcall_return: +; CHECK: retl $8 diff --git a/test/CodeGen/X86/vararg_no_start.ll b/test/CodeGen/X86/vararg_no_start.ll new file mode 100644 index 000000000000..ab5c6fc58aa2 --- /dev/null +++ b/test/CodeGen/X86/vararg_no_start.ll @@ -0,0 +1,9 @@ +; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s + +define void @foo(i8*, ...) 
{ + ret void +} +; CHECK-LABEL: {{^_?}}foo: +; CHECK-NOT: movq +; CHECK: retq diff --git a/test/CodeGen/X86/vastart-defs-eflags.ll b/test/CodeGen/X86/vastart-defs-eflags.ll index 6017753fc8fd..d0c515089f48 100644 --- a/test/CodeGen/X86/vastart-defs-eflags.ll +++ b/test/CodeGen/X86/vastart-defs-eflags.ll @@ -14,6 +14,7 @@ entry: br i1 %tobool, label %if.end, label %if.then if.then: ; preds = %entry + call void @llvm.va_start(i8* null) br label %if.end if.end: ; preds = %entry, %if.then @@ -21,3 +22,4 @@ if.end: ; preds = %entry, %if.then ret i32 %hasflag } +declare void @llvm.va_start(i8*) nounwind diff --git a/test/CodeGen/X86/vec-loadsingles-alignment.ll b/test/CodeGen/X86/vec-loadsingles-alignment.ll new file mode 100644 index 000000000000..6aa2adb228e1 --- /dev/null +++ b/test/CodeGen/X86/vec-loadsingles-alignment.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s + +@e = global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8], align 16 +@d = global [8 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16 + +; The global 'e' has 16 byte alignment, so make sure we don't generate an +; aligned 32-byte load instruction when we combine the load+insert sequence. + +define i32 @subb() nounwind ssp { +; CHECK-LABEL: subb: +; CHECK: vmovups e(%rip), %ymm +entry: + %0 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 7), align 4 + %1 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 6), align 8 + %2 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 5), align 4 + %3 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 4), align 16 + %4 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 3), align 4 + %5 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 2), align 8 + %6 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 1), align 4 + %7 = load i32* getelementptr inbounds ([8 x i32]* @e, i64 0, i64 0), align 16 + %vecinit.i = insertelement <8 x i32> undef, i32 %7, i32 0 + %vecinit1.i = insertelement <8 x i32> %vecinit.i, i32 %6, i32 1 + %vecinit2.i = insertelement <8 x i32> %vecinit1.i, i32 %5, i32 2 + %vecinit3.i = insertelement <8 x i32> %vecinit2.i, i32 %4, i32 3 + %vecinit4.i = insertelement <8 x i32> %vecinit3.i, i32 %3, i32 4 + %vecinit5.i = insertelement <8 x i32> %vecinit4.i, i32 %2, i32 5 + %vecinit6.i = insertelement <8 x i32> %vecinit5.i, i32 %1, i32 6 + %vecinit7.i = insertelement <8 x i32> %vecinit6.i, i32 %0, i32 7 + %8 = bitcast <8 x i32> %vecinit7.i to <32 x i8> + tail call void @llvm.x86.avx.storeu.dq.256(i8* bitcast ([8 x i32]* @d to i8*), <32 x i8> %8) + ret i32 0 +} + +declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind + diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll index 1a6c05dd9f41..8600c48aaac1 100644 --- a/test/CodeGen/X86/vec_cast2.ll +++ b/test/CodeGen/X86/vec_cast2.ll @@ -1,75 +1,177 @@ ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE -;CHECK-LABEL: foo1_8: -;CHECK: vcvtdq2ps -;CHECK: ret -; -;CHECK-WIDE-LABEL: foo1_8: -;CHECK-WIDE: vpmovzxbd %xmm0, %xmm1 -;CHECK-WIDE-NEXT: vpslld $24, %xmm1, %xmm1 -;CHECK-WIDE-NEXT: vpsrad $24, %xmm1, %xmm1 -;CHECK-WIDE-NEXT: vpshufb {{.*}}, %xmm0, %xmm0 -;CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0 -;CHECK-WIDE-NEXT: vpsrad $24, %xmm0, 
%xmm0 -;CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -;CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 -;CHECK-WIDE-NEXT: ret define <8 x float> @foo1_8(<8 x i8> %src) { +; CHECK-LABEL: foo1_8: +; CHECK: ## BB#0: +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0 +; CHECK-NEXT: vpslld $24, %xmm0, %xmm0 +; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-NEXT: vpslld $24, %xmm1, %xmm1 +; CHECK-NEXT: vpsrad $24, %xmm1, %xmm1 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: foo1_8: +; CHECK-WIDE: ## BB#0: +; CHECK-WIDE-NEXT: vpmovzxbd %xmm0, %xmm1 +; CHECK-WIDE-NEXT: vpslld $24, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpsrad $24, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-WIDE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-WIDE-NEXT: retl %res = sitofp <8 x i8> %src to <8 x float> ret <8 x float> %res } -;CHECK-LABEL: foo1_4: -;CHECK: vcvtdq2ps -;CHECK: ret -; -;CHECK-WIDE-LABEL: foo1_4: -;CHECK-WIDE: vpmovzxbd %xmm0, %xmm0 -;CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0 -;CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0 -;CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 -;CHECK-WIDE-NEXT: ret define <4 x float> @foo1_4(<4 x i8> %src) { +; CHECK-LABEL: foo1_4: +; CHECK: ## BB#0: +; CHECK-NEXT: vpslld $24, %xmm0, %xmm0 +; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: foo1_4: +; CHECK-WIDE: ## BB#0: +; CHECK-WIDE-NEXT: vpmovzxbd %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpslld $24, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpsrad $24, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = sitofp <4 x i8> %src to <4 x float> ret <4 x float> %res } -;CHECK-LABEL: foo2_8: -;CHECK: vcvtdq2ps -;CHECK: ret -; -;CHECK-WIDE-LABEL: foo2_8: -;CHECK-WIDE: vcvtdq2ps %ymm{{.*}}, %ymm{{.*}} -;CHECK-WIDE: ret define <8 x float> @foo2_8(<8 x i8> %src) { +; CHECK-LABEL: foo2_8: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovzxwd %xmm0, %xmm1 +; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vandps LCPI2_0, %ymm0, %ymm0 +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: foo2_8: +; CHECK-WIDE: ## BB#0: +; CHECK-WIDE-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-WIDE-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; CHECK-WIDE-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; CHECK-WIDE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; CHECK-WIDE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,3] +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; CHECK-WIDE-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; CHECK-WIDE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-WIDE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 +; CHECK-WIDE-NEXT: retl %res = uitofp <8 x i8> %src to <8 x float> ret <8 x float> %res } -;CHECK-LABEL: foo2_4: -;CHECK: vcvtdq2ps -;CHECK: ret -; -;CHECK-WIDE-LABEL: foo2_4: -;CHECK-WIDE: vcvtdq2ps %xmm{{.*}}, %xmm{{.*}} -;CHECK-WIDE: ret define <4 x float> @foo2_4(<4 x i8> %src) { +; CHECK-LABEL: foo2_4: +; CHECK: ## BB#0: +; CHECK-NEXT: vandps LCPI3_0, %xmm0, %xmm0 +; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: foo2_4: +; CHECK-WIDE: ## BB#0: +; CHECK-WIDE-NEXT: vpmovzxbd %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = uitofp <4 x i8> %src to <4 x float> ret <4 x float> %res } -;CHECK-LABEL: foo3_8: -;CHECK: vcvttps2dq -;CHECK: ret define <8 x i8> @foo3_8(<8 x float> %src) { +; CHECK-LABEL: foo3_8: +; CHECK: ## BB#0: +; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: foo3_8: +; CHECK-WIDE: ## BB#0: +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax +; CHECK-WIDE-NEXT: shll $8, %eax +; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx +; CHECK-WIDE-NEXT: movzbl %cl, %ecx +; CHECK-WIDE-NEXT: orl %eax, %ecx +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax +; CHECK-WIDE-NEXT: shll $8, %eax +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx +; CHECK-WIDE-NEXT: movzbl %dl, %edx +; CHECK-WIDE-NEXT: orl %eax, %edx +; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm1 +; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax +; CHECK-WIDE-NEXT: shll $8, %eax +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx +; CHECK-WIDE-NEXT: movzbl %cl, %ecx +; CHECK-WIDE-NEXT: orl %eax, %ecx +; CHECK-WIDE-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax +; CHECK-WIDE-NEXT: shll $8, %eax +; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx +; CHECK-WIDE-NEXT: movzbl %cl, %ecx +; CHECK-WIDE-NEXT: orl %eax, %ecx +; CHECK-WIDE-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0 +; CHECK-WIDE-NEXT: vzeroupper +; CHECK-WIDE-NEXT: retl %res = fptosi <8 x float> %src to <8 x i8> ret <8 x i8> %res } -;CHECK-LABEL: foo3_4: -;CHECK: vcvttps2dq -;CHECK: ret + define <4 x i8> @foo3_4(<4 x float> %src) { +; CHECK-LABEL: foo3_4: +; CHECK: ## BB#0: +; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: retl +; +; CHECK-WIDE-LABEL: foo3_4: +; CHECK-WIDE: ## BB#0: +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax +; CHECK-WIDE-NEXT: shll $8, %eax +; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm1 
= xmm0[1,0] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %ecx +; CHECK-WIDE-NEXT: movzbl %cl, %ecx +; CHECK-WIDE-NEXT: orl %eax, %ecx +; CHECK-WIDE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax +; CHECK-WIDE-NEXT: shll $8, %eax +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %edx +; CHECK-WIDE-NEXT: movzbl %dl, %edx +; CHECK-WIDE-NEXT: orl %eax, %edx +; CHECK-WIDE-NEXT: vpinsrw $0, %edx, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; CHECK-WIDE-NEXT: retl %res = fptosi <4 x float> %src to <4 x i8> ret <4 x i8> %res } diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll deleted file mode 100644 index 4da79538dbf6..000000000000 --- a/test/CodeGen/X86/vec_compare-2.ll +++ /dev/null @@ -1,30 +0,0 @@ -; RUN: llc < %s -mtriple=i686-linux -mcpu=penryn | FileCheck %s - -declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone - -declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone - -declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone - -define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) { -entry: -; CHECK: cfi_def_cfa_offset -; CHECK-NOT: set -; CHECK: pmovzxwq -; CHECK: pshufb - %shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1] - %cmp318.i = sext <4 x i1> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1] - %sub322.i = sub <4 x i32> %shr.i, zeroinitializer ; <<4 x i32>> [#uses=1] - %cmp323.x = icmp slt <4 x i32> zeroinitializer, %sub322.i ; <<4 x i1>> [#uses=1] - %cmp323.i = sext <4 x i1> %cmp323.x to <4 x i32> ; <<4 x i32>> [#uses=1] - %or.i = or <4 x i32> %cmp318.i, %cmp323.i ; <<4 x i32>> [#uses=1] - %tmp10.i83.i = bitcast <4 x i32> %or.i to <4 x float> ; <<4 x float>> [#uses=1] - %0 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> undef, <4 x float> undef, <4 x float> %tmp10.i83.i) nounwind ; <<4 x float>> [#uses=1] - %conv.i.i15.i = bitcast <4 x float> %0 to <4 x i32> ; <<4 x i32>> [#uses=1] - %swz.i.i28.i = shufflevector <4 x i32> %conv.i.i15.i, <4 x i32> undef, <2 x i32> <i32 0, i32 1> ; <<2 x i32>> [#uses=1] - %tmp6.i29.i = bitcast <2 x i32> %swz.i.i28.i to <4 x i16> ; <<4 x i16>> [#uses=1] - %swz.i30.i = shufflevector <4 x i16> %tmp6.i29.i, <4 x i16> undef, <2 x i32> <i32 0, i32 1> ; <<2 x i16>> [#uses=1] - store <2 x i16> %swz.i30.i, <2 x i16>* undef - unreachable - ret void -} diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll index bddd53514643..318aca1d54cb 100644 --- a/test/CodeGen/X86/vec_ctbits.ll +++ b/test/CodeGen/X86/vec_ctbits.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=x86-64 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) @@ -7,12 +7,61 @@ declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) define <2 x i64> @footz(<2 x i64> %a) nounwind { %c = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true) ret <2 x i64> %c + +; CHECK-LABEL: footz +; CHECK: bsfq +; CHECK: bsfq } define <2 x i64> @foolz(<2 x i64> %a) nounwind { %c = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 true) ret <2 x i64> %c + +; CHECK-LABEL: foolz +; CHECK: bsrq +; CHECK: xorq $63 +; CHECK: bsrq +; CHECK: xorq $63 } + define <2 x i64> @foopop(<2 x i64> %a) nounwind { %c = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) ret <2 x i64> %c } + +declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) +declare <2 x 
i32> @llvm.ctlz.v2i32(<2 x i32>, i1) +declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) + +define <2 x i32> @promtz(<2 x i32> %a) nounwind { + %c = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 false) + ret <2 x i32> %c + +; CHECK: .quad 4294967296 +; CHECK: .quad 4294967296 +; CHECK-LABEL: promtz +; CHECK: bsfq +; CHECK: cmov +; CHECK: bsfq +; CHECK: cmov +} +define <2 x i32> @promlz(<2 x i32> %a) nounwind { + %c = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) + ret <2 x i32> %c + +; CHECK: .quad 4294967295 +; CHECK: .quad 4294967295 +; CHECK: .quad 32 +; CHECK: .quad 32 +; CHECK-LABEL: promlz +; CHECK: pand +; CHECK: bsrq +; CHECK: xorq $63 +; CHECK: bsrq +; CHECK: xorq $63 +; CHECK: psub +} + +define <2 x i32> @prompop(<2 x i32> %a) nounwind { + %c = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) + ret <2 x i32> %c +} diff --git a/test/CodeGen/X86/vec_extract-avx.ll b/test/CodeGen/X86/vec_extract-avx.ll new file mode 100644 index 000000000000..fbb84170dc83 --- /dev/null +++ b/test/CodeGen/X86/vec_extract-avx.ll @@ -0,0 +1,82 @@ +target triple = "x86_64-unknown-unknown" + +; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s + +; When extracting multiple consecutive elements from a larger +; vector into a smaller one, do it efficiently. We should use +; an EXTRACT_SUBVECTOR node internally rather than a bunch of +; single element extractions. + +; Extracting the low elements only requires using the right kind of store. +define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) { + %ext0 = extractelement <8 x float> %v, i32 0 + %ext1 = extractelement <8 x float> %v, i32 1 + %ext2 = extractelement <8 x float> %v, i32 2 + %ext3 = extractelement <8 x float> %v, i32 3 + %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 + %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 + %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 + %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 + store <4 x float> %ins3, <4 x float>* %ptr, align 16 + ret void + +; CHECK-LABEL: low_v8f32_to_v4f32 +; CHECK: vmovaps +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} + +; Extracting the high elements requires just one AVX instruction. +define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) { + %ext0 = extractelement <8 x float> %v, i32 4 + %ext1 = extractelement <8 x float> %v, i32 5 + %ext2 = extractelement <8 x float> %v, i32 6 + %ext3 = extractelement <8 x float> %v, i32 7 + %ins0 = insertelement <4 x float> undef, float %ext0, i32 0 + %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1 + %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2 + %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3 + store <4 x float> %ins3, <4 x float>* %ptr, align 16 + ret void + +; CHECK-LABEL: high_v8f32_to_v4f32 +; CHECK: vextractf128 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} + +; Make sure element type doesn't alter the codegen. Note that +; if we were actually using the vector in this function and +; had AVX2, we should generate vextracti128 (the int version).
+define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) { + %ext0 = extractelement <8 x i32> %v, i32 4 + %ext1 = extractelement <8 x i32> %v, i32 5 + %ext2 = extractelement <8 x i32> %v, i32 6 + %ext3 = extractelement <8 x i32> %v, i32 7 + %ins0 = insertelement <4 x i32> undef, i32 %ext0, i32 0 + %ins1 = insertelement <4 x i32> %ins0, i32 %ext1, i32 1 + %ins2 = insertelement <4 x i32> %ins1, i32 %ext2, i32 2 + %ins3 = insertelement <4 x i32> %ins2, i32 %ext3, i32 3 + store <4 x i32> %ins3, <4 x i32>* %ptr, align 16 + ret void + +; CHECK-LABEL: high_v8i32_to_v4i32 +; CHECK: vextractf128 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} + +; Make sure that element size doesn't alter the codegen. +define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) { + %ext0 = extractelement <4 x double> %v, i32 2 + %ext1 = extractelement <4 x double> %v, i32 3 + %ins0 = insertelement <2 x double> undef, double %ext0, i32 0 + %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1 + store <2 x double> %ins1, <2 x double>* %ptr, align 16 + ret void + +; CHECK-LABEL: high_v4f64_to_v2f64 +; CHECK: vextractf128 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +} diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll index 747c8a8e8d02..530911add121 100644 --- a/test/CodeGen/X86/vec_extract-sse4.ll +++ b/test/CodeGen/X86/vec_extract-sse4.ll @@ -1,11 +1,12 @@ ; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 | FileCheck %s define void @t1(float* %R, <4 x float>* %P1) nounwind { -; CHECK-LABEL: @t1 -; CHECK: movl 4(%esp), %[[R0:e[abcd]x]] -; CHECK-NEXT: movl 8(%esp), %[[R1:e[abcd]x]] -; CHECK-NEXT: movl 12(%[[R1]]), %[[R2:e[abcd]x]] -; CHECK-NEXT: movl %[[R2]], (%[[R0]]) +; CHECK-LABEL: t1: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movss 12(%ecx), %xmm0 +; CHECK-NEXT: movss %xmm0, (%eax) ; CHECK-NEXT: retl %X = load <4 x float>* %P1 @@ -15,9 +16,15 @@ define void @t1(float* %R, <4 x float>* %P1) nounwind { } define float @t2(<4 x float>* %P1) nounwind { -; CHECK-LABEL: @t2 -; CHECK: movl 4(%esp), %[[R0:e[abcd]x]] -; CHECK-NEXT: flds 8(%[[R0]]) +; CHECK-LABEL: t2: +; CHECK: # BB#0: +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movapd (%eax), %xmm0 +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax ; CHECK-NEXT: retl %X = load <4 x float>* %P1 @@ -26,11 +33,12 @@ define float @t2(<4 x float>* %P1) nounwind { } define void @t3(i32* %R, <4 x i32>* %P1) nounwind { -; CHECK-LABEL: @t3 -; CHECK: movl 4(%esp), %[[R0:e[abcd]x]] -; CHECK-NEXT: movl 8(%esp), %[[R1:e[abcd]x]] -; CHECK-NEXT: movl 12(%[[R1]]), %[[R2:e[abcd]x]] -; CHECK-NEXT: movl %[[R2]], (%[[R0]]) +; CHECK-LABEL: t3: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl 12(%ecx), %ecx +; CHECK-NEXT: movl %ecx, (%eax) ; CHECK-NEXT: retl %X = load <4 x i32>* %P1 @@ -40,9 +48,10 @@ define void @t3(i32* %R, <4 x i32>* %P1) nounwind { } define i32 @t4(<4 x i32>* %P1) nounwind { -; CHECK-LABEL: @t4 -; CHECK: movl 4(%esp), %[[R0:e[abcd]x]] -; CHECK-NEXT: movl 12(%[[R0]]), %eax +; CHECK-LABEL: t4: +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl 12(%eax), %eax ; CHECK-NEXT: retl %X = load <4 x i32>* %P1 diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll index 
88f5a585b9fd..6df7be7a087b 100644 --- a/test/CodeGen/X86/vec_extract.ll +++ b/test/CodeGen/X86/vec_extract.ll @@ -1,10 +1,17 @@ -; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 -o %t -; RUN: grep movss %t | count 4 -; RUN: grep movhlps %t | count 1 -; RUN: not grep pshufd %t -; RUN: grep unpckhpd %t | count 1 +; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" define void @test1(<4 x float>* %F, float* %f) nounwind { +; CHECK-LABEL: test1: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movaps (%ecx), %xmm0 +; CHECK-NEXT: addps %xmm0, %xmm0 +; CHECK-NEXT: movss %xmm0, (%eax) +; CHECK-NEXT: retl +entry: %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=2] %tmp7 = fadd <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1] %tmp2 = extractelement <4 x float> %tmp7, i32 0 ; <float> [#uses=1] @@ -13,6 +20,18 @@ define void @test1(<4 x float>* %F, float* %f) nounwind { } define float @test2(<4 x float>* %F, float* %f) nounwind { +; CHECK-LABEL: test2: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movaps (%eax), %xmm0 +; CHECK-NEXT: addps %xmm0, %xmm0 +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: movss %xmm0, (%esp) +; CHECK-NEXT: flds (%esp) +; CHECK-NEXT: popl %eax +; CHECK-NEXT: retl +entry: %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=2] %tmp7 = fadd <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1] %tmp2 = extractelement <4 x float> %tmp7, i32 2 ; <float> [#uses=1] @@ -20,6 +39,14 @@ define float @test2(<4 x float>* %F, float* %f) nounwind { } define void @test3(float* %R, <4 x float>* %P1) nounwind { +; CHECK-LABEL: test3: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movss 12(%ecx), %xmm0 +; CHECK-NEXT: movss %xmm0, (%eax) +; CHECK-NEXT: retl +entry: %X = load <4 x float>* %P1 ; <<4 x float>> [#uses=1] %tmp = extractelement <4 x float> %X, i32 3 ; <float> [#uses=1] store float %tmp, float* %R @@ -27,6 +54,17 @@ define void @test3(float* %R, <4 x float>* %P1) nounwind { } define double @test4(double %A) nounwind { +; CHECK-LABEL: test4: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: subl $12, %esp +; CHECK-NEXT: calll foo +; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: addsd {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: movsd %xmm0, (%esp) +; CHECK-NEXT: fldl (%esp) +; CHECK-NEXT: addl $12, %esp +; CHECK-NEXT: retl +entry: %tmp1 = call <2 x double> @foo( ) ; <<2 x double>> [#uses=1] %tmp2 = extractelement <2 x double> %tmp1, i32 1 ; <double> [#uses=1] %tmp3 = fadd double %tmp2, %A ; <double> [#uses=1] diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll index 4c14a9602d41..ac02acfed342 100644 --- a/test/CodeGen/X86/vec_fabs.ll +++ b/test/CodeGen/X86/vec_fabs.ll @@ -38,21 +38,38 @@ define <8 x float> @fabs_v8f32(<8 x float> %p) declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p) ; PR20354: when generating code for a vector fabs op, -; make sure the correct mask is used for all vector elements. 
-; CHECK-LABEL: .LCPI4_0: -; CHECK-NEXT: .long 2147483647 -; CHECK-NEXT: .long 2147483647 -define i64 @fabs_v2f32(<2 x float> %v) { -; CHECK-LABEL: fabs_v2f32: -; CHECK: movabsq $-9223372034707292160, %[[R:r[^ ]+]] -; CHECK-NEXT: vmovq %[[R]], %[[X:xmm[0-9]+]] -; CHECK-NEXT: vandps {{.*}}.LCPI4_0{{.*}}, %[[X]], %[[X]] -; CHECK-NEXT: vmovq %[[X]], %rax -; CHECK-NEXT: retq - %highbits = bitcast i64 9223372039002259456 to <2 x float> ; 0x8000_0000_8000_0000 - %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %highbits) - %ret = bitcast <2 x float> %fabs to i64 - ret i64 %ret +; make sure that we're only turning off the sign bit of each float value. +; No constant pool loads or vector ops are needed for the fabs of a +; bitcasted integer constant; we should just return an integer constant +; that has the sign bits turned off. +; +; So instead of something like this: +; movabsq (constant pool load of mask for sign bits) +; vmovq (move from integer register to vector/fp register) +; vandps (mask off sign bits) +; vmovq (move vector/fp register back to integer return register) +; +; We should generate: +; mov (put constant value in return register) + +define i64 @fabs_v2f32_1() { +; CHECK-LABEL: fabs_v2f32_1: +; CHECK: movabsq $9223372032559808512, %rax # imm = 0x7FFFFFFF00000000 +; CHECK-NEXT: retq + %bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000 + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast) + %ret = bitcast <2 x float> %fabs to i64 + ret i64 %ret +} + +define i64 @fabs_v2f32_2() { +; CHECK-LABEL: fabs_v2f32_2: +; CHECK: movl $2147483647, %eax # imm = 0x7FFFFFFF +; CHECK-NEXT: retq + %bitcast = bitcast i64 4294967295 to <2 x float> ; 0x0000_0000_FFFF_FFFF + %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast) + %ret = bitcast <2 x float> %fabs to i64 + ret i64 %ret } declare <2 x float> @llvm.fabs.v2f32(<2 x float> %p) diff --git a/test/CodeGen/X86/vec_fneg.ll b/test/CodeGen/X86/vec_fneg.ll index d49c70e56391..9743f7148c69 100644 --- a/test/CodeGen/X86/vec_fneg.ll +++ b/test/CodeGen/X86/vec_fneg.ll @@ -1,11 +1,45 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s +; FNEG is defined as subtraction from -0.0. + +; This test verifies that we use an xor with a constant to flip the sign bits; no subtraction needed. define <4 x float> @t1(<4 x float> %Q) { - %tmp15 = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q - ret <4 x float> %tmp15 +; CHECK-LABEL: t1: +; CHECK: xorps {{.*}}LCPI0_0{{.*}}, %xmm0 +; CHECK-NEXT: retq + %tmp = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q + ret <4 x float> %tmp } +; This test verifies that we generate an FP subtraction because "0.0 - x" is not an fneg. define <4 x float> @t2(<4 x float> %Q) { - %tmp15 = fsub <4 x float> zeroinitializer, %Q - ret <4 x float> %tmp15 +; CHECK-LABEL: t2: +; CHECK: xorps %[[X:xmm[0-9]+]], %[[X]] +; CHECK-NEXT: subps %xmm0, %[[X]] +; CHECK-NEXT: movaps %[[X]], %xmm0 +; CHECK-NEXT: retq + %tmp = fsub <4 x float> zeroinitializer, %Q + ret <4 x float> %tmp +} + +; If we're bitcasting an integer to an FP vector, we should avoid the FPU/vector unit entirely. +; Make sure that we're flipping the sign bit and only the sign bit of each float. 
+; So instead of something like this: +; movd %rdi, %xmm0 +; xorps .LCPI2_0(%rip), %xmm0 +; +; We should generate: +; movabsq (put sign bit mask in integer register) +; xorq (flip sign bits) +; movd (move to xmm return register) + +define <2 x float> @fneg_bitcast(i64 %i) { +; CHECK-LABEL: fneg_bitcast: +; CHECK: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000 +; CHECK-NEXT: xorq %rdi, %rax +; CHECK-NEXT: movd %rax, %xmm0 +; CHECK-NEXT: retq + %bitcast = bitcast i64 %i to <2 x float> + %fneg = fsub <2 x float> <float -0.0, float -0.0>, %bitcast + ret <2 x float> %fneg } diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll index 5cb9f694bd61..b72044aee30b 100644 --- a/test/CodeGen/X86/vec_insert-5.ll +++ b/test/CodeGen/X86/vec_insert-5.ll @@ -2,66 +2,87 @@ ; There are no MMX operations in @t1 define void @t1(i32 %a, x86_mmx* %P) nounwind { - %tmp12 = shl i32 %a, 12 - %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1 - %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0 - %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx - store x86_mmx %tmp23, x86_mmx* %P - ret void - ; CHECK-LABEL: t1: -; CHECK-NOT: %mm -; CHECK: shll $12 -; CHECK-NOT: %mm +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: shll $12, %ecx +; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1] +; CHECK-NEXT: movlpd %xmm0, (%eax) +; CHECK-NEXT: retl + %tmp12 = shl i32 %a, 12 + %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1 + %tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0 + %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx + store x86_mmx %tmp23, x86_mmx* %P + ret void } define <4 x float> @t2(<4 x float>* %P) nounwind { - %tmp1 = load <4 x float>* %P - %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 > - ret <4 x float> %tmp2 - ; CHECK-LABEL: t2: -; CHECK: pslldq $12 +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movaps (%eax), %xmm1 +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; CHECK-NEXT: retl + %tmp1 = load <4 x float>* %P + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 4, i32 4, i32 4, i32 0 > + ret <4 x float> %tmp2 } define <4 x float> @t3(<4 x float>* %P) nounwind { - %tmp1 = load <4 x float>* %P - %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 > - ret <4 x float> %tmp2 - ; CHECK-LABEL: t3: -; CHECK: psrldq $8 +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movaps (%eax), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,0] +; CHECK-NEXT: retl + %tmp1 = load <4 x float>* %P + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> zeroinitializer, <4 x i32> < i32 2, i32 3, i32 4, i32 4 > + ret <4 x float> %tmp2 } define <4 x float> @t4(<4 x float>* %P) nounwind { - %tmp1 = load <4 x float>* %P - %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 > - ret <4 x float> %tmp2 - ; CHECK-LABEL: t4: -; CHECK: psrldq $12 +; CHECK: # BB#0: +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movaps (%eax), %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0] +; CHECK-NEXT: shufps {{.*#+}} xmm0 =
xmm0[0,2],xmm1[0,0] +; CHECK-NEXT: retl + %tmp1 = load <4 x float>* %P + %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 7, i32 0, i32 0, i32 0 > + ret <4 x float> %tmp2 } define <16 x i8> @t5(<16 x i8> %x) nounwind { - %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17> - ret <16 x i8> %s - ; CHECK-LABEL: t5: -; CHECK: psrldq $1 +; CHECK: # BB#0: +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; CHECK-NEXT: retl + %s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17> + ret <16 x i8> %s } define <16 x i8> @t6(<16 x i8> %x) nounwind { - %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <16 x i8> %s - ; CHECK-LABEL: t6: -; CHECK: palignr $1 +; CHECK: # BB#0: +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; CHECK-NEXT: retl + %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i8> %s } define <16 x i8> @t7(<16 x i8> %x) nounwind { - %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2> - ret <16 x i8> %s - ; CHECK-LABEL: t7: -; CHECK: pslldq $13 +; CHECK: # BB#0: +; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2] +; CHECK-NEXT: retl + %s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2> + ret <16 x i8> %s } diff --git a/test/CodeGen/X86/vec_insert-6.ll b/test/CodeGen/X86/vec_insert-6.ll deleted file mode 100644 index 4583e1925e59..000000000000 --- a/test/CodeGen/X86/vec_insert-6.ll +++ /dev/null @@ -1,9 +0,0 @@ -; REQUIRES: asserts -; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | grep pslldq -; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -mtriple=i686-apple-darwin9 -o /dev/null -stats -info-output-file - | grep asm-printer | grep 6 - -define <4 x float> @t3(<4 x float>* %P) nounwind { - %tmp1 = load <4 x float>* %P - %tmp2 = shufflevector <4 x float> zeroinitializer, <4 x float> %tmp1, <4 x i32> < i32 4, i32 4, i32 4, i32 0 > - ret <4 x float> %tmp2 -} diff --git a/test/CodeGen/X86/vec_insert.ll b/test/CodeGen/X86/vec_insert.ll deleted file mode 100644 index 0ed8f1052366..000000000000 --- a/test/CodeGen/X86/vec_insert.ll +++ /dev/null @@ -1,19 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | grep movss | count 1 -; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | not grep pinsrw - -define void @test(<4 x float>* %F, i32 %I) nounwind { - %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=1] - %f = sitofp 
i32 %I to float ; <float> [#uses=1] - %tmp1 = insertelement <4 x float> %tmp, float %f, i32 0 ; <<4 x float>> [#uses=2] - %tmp18 = fadd <4 x float> %tmp1, %tmp1 ; <<4 x float>> [#uses=1] - store <4 x float> %tmp18, <4 x float>* %F - ret void -} - -define void @test2(<4 x float>* %F, i32 %I, float %g) nounwind { - %tmp = load <4 x float>* %F ; <<4 x float>> [#uses=1] - %f = sitofp i32 %I to float ; <float> [#uses=1] - %tmp1 = insertelement <4 x float> %tmp, float %f, i32 2 ; <<4 x float>> [#uses=1] - store <4 x float> %tmp1, <4 x float>* %F - ret void -} diff --git a/test/CodeGen/X86/vec_loadsingles.ll b/test/CodeGen/X86/vec_loadsingles.ll index 8812c4f820c6..fd132a52b8f1 100644 --- a/test/CodeGen/X86/vec_loadsingles.ll +++ b/test/CodeGen/X86/vec_loadsingles.ll @@ -1,12 +1,145 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movq - -define <4 x float> @a(<4 x float> %a, float* nocapture %p) nounwind readonly { -entry: - %tmp1 = load float* %p - %vecins = insertelement <4 x float> undef, float %tmp1, i32 0 - %add.ptr = getelementptr float* %p, i32 1 - %tmp5 = load float* %add.ptr - %vecins7 = insertelement <4 x float> %vecins, float %tmp5, i32 1 - ret <4 x float> %vecins7 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=FAST32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=SLOW32 + +define <4 x float> @merge_2_floats(float* nocapture %p) nounwind readonly { + %tmp1 = load float* %p + %vecins = insertelement <4 x float> undef, float %tmp1, i32 0 + %add.ptr = getelementptr float* %p, i32 1 + %tmp5 = load float* %add.ptr + %vecins7 = insertelement <4 x float> %vecins, float %tmp5, i32 1 + ret <4 x float> %vecins7 + +; ALL-LABEL: merge_2_floats +; ALL: vmovq +; ALL-NEXT: retq +} + +; Test-case generated due to a crash when trying to treat loading the first +; two i64s of a <4 x i64> as a load of two i32s. +define <4 x i64> @merge_2_floats_into_4() { + %1 = load i64** undef, align 8 + %2 = getelementptr inbounds i64* %1, i64 0 + %3 = load i64* %2 + %4 = insertelement <4 x i64> undef, i64 %3, i32 0 + %5 = load i64** undef, align 8 + %6 = getelementptr inbounds i64* %5, i64 1 + %7 = load i64* %6 + %8 = insertelement <4 x i64> %4, i64 %7, i32 1 + %9 = shufflevector <4 x i64> %8, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x i64> %9 + +; ALL-LABEL: merge_2_floats_into_4 +; ALL: vmovups +; ALL-NEXT: retq +} + +define <4 x float> @merge_4_floats(float* %ptr) { + %a = load float* %ptr, align 8 + %vec = insertelement <4 x float> undef, float %a, i32 0 + %idx1 = getelementptr inbounds float* %ptr, i64 1 + %b = load float* %idx1, align 8 + %vec2 = insertelement <4 x float> %vec, float %b, i32 1 + %idx3 = getelementptr inbounds float* %ptr, i64 2 + %c = load float* %idx3, align 8 + %vec4 = insertelement <4 x float> %vec2, float %c, i32 2 + %idx5 = getelementptr inbounds float* %ptr, i64 3 + %d = load float* %idx5, align 8 + %vec6 = insertelement <4 x float> %vec4, float %d, i32 3 + ret <4 x float> %vec6 + +; ALL-LABEL: merge_4_floats +; ALL: vmovups +; ALL-NEXT: retq +} + +; PR21710 ( http://llvm.org/bugs/show_bug.cgi?id=21710 ) +; Make sure that 32-byte vectors are handled efficiently. +; If the target has slow 32-byte accesses, we should still generate +; 16-byte loads. 
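; Editorial aside, a sketch of the expected lowering for the 8 x float merge
; below (registers are illustrative only; the test asserts just the opcodes):
;
;   FAST32 (fast unaligned 32-byte access) - one 32-byte load:
;     vmovups (%rdi), %ymm0
;
;   SLOW32 (slow unaligned 32-byte access) - two 16-byte accesses:
;     vmovups (%rdi), %xmm0
;     vinsertf128 $1, 16(%rdi), %ymm0, %ymm0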
+ +define <8 x float> @merge_8_floats(float* %ptr) { + %a = load float* %ptr, align 4 + %vec = insertelement <8 x float> undef, float %a, i32 0 + %idx1 = getelementptr inbounds float* %ptr, i64 1 + %b = load float* %idx1, align 4 + %vec2 = insertelement <8 x float> %vec, float %b, i32 1 + %idx3 = getelementptr inbounds float* %ptr, i64 2 + %c = load float* %idx3, align 4 + %vec4 = insertelement <8 x float> %vec2, float %c, i32 2 + %idx5 = getelementptr inbounds float* %ptr, i64 3 + %d = load float* %idx5, align 4 + %vec6 = insertelement <8 x float> %vec4, float %d, i32 3 + %idx7 = getelementptr inbounds float* %ptr, i64 4 + %e = load float* %idx7, align 4 + %vec8 = insertelement <8 x float> %vec6, float %e, i32 4 + %idx9 = getelementptr inbounds float* %ptr, i64 5 + %f = load float* %idx9, align 4 + %vec10 = insertelement <8 x float> %vec8, float %f, i32 5 + %idx11 = getelementptr inbounds float* %ptr, i64 6 + %g = load float* %idx11, align 4 + %vec12 = insertelement <8 x float> %vec10, float %g, i32 6 + %idx13 = getelementptr inbounds float* %ptr, i64 7 + %h = load float* %idx13, align 4 + %vec14 = insertelement <8 x float> %vec12, float %h, i32 7 + ret <8 x float> %vec14 + +; ALL-LABEL: merge_8_floats + +; FAST32: vmovups +; FAST32-NEXT: retq + +; SLOW32: vmovups +; SLOW32-NEXT: vinsertf128 +; SLOW32-NEXT: retq +} + +define <4 x double> @merge_4_doubles(double* %ptr) { + %a = load double* %ptr, align 8 + %vec = insertelement <4 x double> undef, double %a, i32 0 + %idx1 = getelementptr inbounds double* %ptr, i64 1 + %b = load double* %idx1, align 8 + %vec2 = insertelement <4 x double> %vec, double %b, i32 1 + %idx3 = getelementptr inbounds double* %ptr, i64 2 + %c = load double* %idx3, align 8 + %vec4 = insertelement <4 x double> %vec2, double %c, i32 2 + %idx5 = getelementptr inbounds double* %ptr, i64 3 + %d = load double* %idx5, align 8 + %vec6 = insertelement <4 x double> %vec4, double %d, i32 3 + ret <4 x double> %vec6 + +; ALL-LABEL: merge_4_doubles +; FAST32: vmovups +; FAST32-NEXT: retq + +; SLOW32: vmovups +; SLOW32-NEXT: vinsertf128 +; SLOW32-NEXT: retq +} + +; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 ) +; Recognize and combine consecutive loads even when the +; first of the combined loads is offset from the base address. 
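; Editorial aside, an illustrative sketch (not part of the test): after the
; combine, the four scalar loads in the function below collapse into one wide
; load at byte offset 32 from %ptr, i.e. roughly:
;
;   %cast = bitcast double* %arrayidx4 to <4 x double>*
;   %wide = load <4 x double>* %cast, align 8
;
; which should then lower to a single vmovups 32(%rdi), %ymm0 on FAST32
; targets.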
+define <4 x double> @merge_4_doubles_offset(double* %ptr) { + %arrayidx4 = getelementptr inbounds double* %ptr, i64 4 + %arrayidx5 = getelementptr inbounds double* %ptr, i64 5 + %arrayidx6 = getelementptr inbounds double* %ptr, i64 6 + %arrayidx7 = getelementptr inbounds double* %ptr, i64 7 + %e = load double* %arrayidx4, align 8 + %f = load double* %arrayidx5, align 8 + %g = load double* %arrayidx6, align 8 + %h = load double* %arrayidx7, align 8 + %vecinit4 = insertelement <4 x double> undef, double %e, i32 0 + %vecinit5 = insertelement <4 x double> %vecinit4, double %f, i32 1 + %vecinit6 = insertelement <4 x double> %vecinit5, double %g, i32 2 + %vecinit7 = insertelement <4 x double> %vecinit6, double %h, i32 3 + ret <4 x double> %vecinit7 + +; ALL-LABEL: merge_4_doubles_offset +; FAST32: vmovups +; FAST32-NEXT: retq + +; SLOW32: vmovups +; SLOW32-NEXT: vinsertf128 +; SLOW32-NEXT: retq } diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll index d1d7608a0411..a13c813ea7b0 100644 --- a/test/CodeGen/X86/vec_set-3.ll +++ b/test/CodeGen/X86/vec_set-3.ll @@ -1,17 +1,37 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -o %t -; RUN: grep pshufd %t | count 2 +; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn | FileCheck %s -define <4 x float> @test(float %a) nounwind { - %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 ; <<4 x float>> [#uses=1] - %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1] - %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp6 +define <4 x float> @test(float %a) { +; CHECK-LABEL: test: +; CHECK: insertps $29, {{.*}}, %xmm0 +; CHECK-NEXT: retl + +entry: + %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 + %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2 + %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3 + ret <4 x float> %tmp6 } -define <2 x i64> @test2(i32 %a) nounwind { - %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2 ; <<4 x i32>> [#uses=1] - %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3 ; <<4 x i32>> [#uses=1] - %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp10 +define <2 x i64> @test2(i32 %a) { +; CHECK-LABEL: test2: +; CHECK: movd {{.*}}, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] +; CHECK-NEXT: retl + +entry: + %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2 + %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3 + %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64> + ret <2 x i64> %tmp10 } +define <4 x float> @test3(<4 x float> %A) { +; CHECK-LABEL: test3: +; CHECK: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; CHECK-NEXT: retl + + %tmp0 = extractelement <4 x float> %A, i32 0 + %tmp1 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef >, float %tmp0, i32 1 + %tmp2 = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 2 + ret <4 x float> %tmp2 +} diff --git a/test/CodeGen/X86/vec_set-5.ll b/test/CodeGen/X86/vec_set-5.ll deleted file mode 100644 index f811a7404a27..000000000000 --- a/test/CodeGen/X86/vec_set-5.ll +++ /dev/null @@ -1,28 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t -; RUN: grep movlhps %t | count 1 -; RUN: grep movq %t | count 2 - -define <4 x float> @test1(float %a, float %b) nounwind { - %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1] - %tmp6 = insertelement <4 x 
float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1] - %tmp8 = insertelement <4 x float> %tmp6, float %b, i32 2 ; <<4 x float>> [#uses=1] - %tmp9 = insertelement <4 x float> %tmp8, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp9 -} - -define <4 x float> @test2(float %a, float %b) nounwind { - %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 ; <<4 x float>> [#uses=1] - %tmp7 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1] - %tmp8 = insertelement <4 x float> %tmp7, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1] - %tmp9 = insertelement <4 x float> %tmp8, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp9 -} - -define <2 x i64> @test3(i32 %a, i32 %b) nounwind { - %tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0 ; <<4 x i32>> [#uses=1] - %tmp6 = insertelement <4 x i32> %tmp, i32 %b, i32 1 ; <<4 x i32>> [#uses=1] - %tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2 ; <<4 x i32>> [#uses=1] - %tmp10 = insertelement <4 x i32> %tmp8, i32 0, i32 3 ; <<4 x i32>> [#uses=1] - %tmp11 = bitcast <4 x i32> %tmp10 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp11 -} diff --git a/test/CodeGen/X86/vec_set-9.ll b/test/CodeGen/X86/vec_set-9.ll deleted file mode 100644 index a73909097c11..000000000000 --- a/test/CodeGen/X86/vec_set-9.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -march=x86-64 -mattr=-avx,-pad-short-functions | FileCheck %s - -; CHECK: test3 -; CHECK: movd -; CHECK-NOT: movd -; CHECK: {{movlhps.*%xmm0, %xmm0}} -; CHECK-NEXT: ret - -define <2 x i64> @test3(i64 %A) nounwind { -entry: - %B = insertelement <2 x i64> undef, i64 %A, i32 1 - ret <2 x i64> %B -} - diff --git a/test/CodeGen/X86/vec_set-E.ll b/test/CodeGen/X86/vec_set-E.ll deleted file mode 100644 index d78be669fc7f..000000000000 --- a/test/CodeGen/X86/vec_set-E.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movq - -define <4 x float> @t(float %X) nounwind { - %tmp11 = insertelement <4 x float> undef, float %X, i32 0 - %tmp12 = insertelement <4 x float> %tmp11, float %X, i32 1 - %tmp27 = insertelement <4 x float> %tmp12, float 0.000000e+00, i32 2 - %tmp28 = insertelement <4 x float> %tmp27, float 0.000000e+00, i32 3 - ret <4 x float> %tmp28 -} diff --git a/test/CodeGen/X86/vec_set-G.ll b/test/CodeGen/X86/vec_set-G.ll deleted file mode 100644 index 4a542feafaff..000000000000 --- a/test/CodeGen/X86/vec_set-G.ll +++ /dev/null @@ -1,9 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movss - -define fastcc void @t(<4 x float> %A) nounwind { - %tmp41896 = extractelement <4 x float> %A, i32 0 ; <float> [#uses=1] - %tmp14082 = insertelement <4 x float> < float 0.000000e+00, float undef, float undef, float undef >, float %tmp41896, i32 1 ; <<4 x float>> [#uses=1] - %tmp14083 = insertelement <4 x float> %tmp14082, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1] - store <4 x float> %tmp14083, <4 x float>* null, align 16 - ret void -} diff --git a/test/CodeGen/X86/vec_set-I.ll b/test/CodeGen/X86/vec_set-I.ll deleted file mode 100644 index c5d6ab88a35d..000000000000 --- a/test/CodeGen/X86/vec_set-I.ll +++ /dev/null @@ -1,13 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s - -; CHECK-NOT: xorp -; CHECK: movd -; CHECK-NOT: xorp - -define void @t1() nounwind { - %tmp298.i.i = load <4 x float>* null, align 16 - %tmp304.i.i = bitcast <4 x float> %tmp298.i.i to <4 x i32> - %tmp305.i.i = and <4 x i32> %tmp304.i.i, < i32 -1, i32 0, i32 0, i32 0 > - store <4 x 
i32> %tmp305.i.i, <4 x i32>* null, align 16 - unreachable -} diff --git a/test/CodeGen/X86/vec_set-J.ll b/test/CodeGen/X86/vec_set-J.ll deleted file mode 100644 index d90ab85b8cf7..000000000000 --- a/test/CodeGen/X86/vec_set-J.ll +++ /dev/null @@ -1,10 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movss -; PR2472 - -define <4 x i32> @a(<4 x i32> %a) nounwind { -entry: - %vecext = extractelement <4 x i32> %a, i32 0 - insertelement <4 x i32> zeroinitializer, i32 %vecext, i32 0 - %add = add <4 x i32> %a, %0 - ret <4 x i32> %add -} diff --git a/test/CodeGen/X86/vec_setcc.ll b/test/CodeGen/X86/vec_setcc.ll index 322dbae0c89f..b69f90cd6e2f 100644 --- a/test/CodeGen/X86/vec_setcc.ll +++ b/test/CodeGen/X86/vec_setcc.ll @@ -62,8 +62,7 @@ define <8 x i16> @v8i16_icmp_ule(<8 x i16> %a, <8 x i16> %b) nounwind readnone s ; SSE2-LABEL: v8i16_icmp_ule: ; SSE2: psubusw %xmm1, %xmm0 ; SSE2: pxor %xmm1, %xmm1 -; SSE2: pcmpeqw %xmm0, %xmm1 -; SSE2: movdqa %xmm1, %xmm0 +; SSE2: pcmpeqw %xmm1, %xmm0 ; SSE41-LABEL: v8i16_icmp_ule: ; SSE41: pminuw %xmm0, %xmm1 @@ -106,8 +105,7 @@ define <4 x i32> @v4i32_icmp_ule(<4 x i32> %a, <4 x i32> %b) nounwind readnone s ; SSE2: pxor %xmm2, %xmm0 ; SSE2: pcmpgtd %xmm1, %xmm0 ; SSE2: pcmpeqd %xmm1, %xmm1 -; SSE2: pxor %xmm0, %xmm1 -; SSE2: movdqa %xmm1, %xmm0 +; SSE2: pxor %xmm1, %xmm0 ; SSE41-LABEL: v4i32_icmp_ule: ; SSE41: pminud %xmm0, %xmm1 diff --git a/test/CodeGen/X86/vec_sext.ll b/test/CodeGen/X86/vec_sext.ll deleted file mode 100644 index 776ddec2e63b..000000000000 --- a/test/CodeGen/X86/vec_sext.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc < %s -march=x86-64 -; PR 9267 - -define<4 x i32> @func_16_32() { - %F = load <4 x i16>* undef - %G = sext <4 x i16> %F to <4 x i32> - %H = load <4 x i16>* undef - %Y = sext <4 x i16> %H to <4 x i32> - %T = add <4 x i32> %Y, %G - store <4 x i32>%T , <4 x i32>* undef - ret <4 x i32> %T -} - -define<4 x i64> @func_16_64() { - %F = load <4 x i16>* undef - %G = sext <4 x i16> %F to <4 x i64> - %H = load <4 x i16>* undef - %Y = sext <4 x i16> %H to <4 x i64> - %T = xor <4 x i64> %Y, %G - store <4 x i64>%T , <4 x i64>* undef - ret <4 x i64> %T -} - -define<4 x i64> @func_32_64() { - %F = load <4 x i32>* undef - %G = sext <4 x i32> %F to <4 x i64> - %H = load <4 x i32>* undef - %Y = sext <4 x i32> %H to <4 x i64> - %T = or <4 x i64> %Y, %G - ret <4 x i64> %T -} - -define<4 x i16> @func_8_16() { - %F = load <4 x i8>* undef - %G = sext <4 x i8> %F to <4 x i16> - %H = load <4 x i8>* undef - %Y = sext <4 x i8> %H to <4 x i16> - %T = add <4 x i16> %Y, %G - ret <4 x i16> %T -} - -define<4 x i32> @func_8_32() { - %F = load <4 x i8>* undef - %G = sext <4 x i8> %F to <4 x i32> - %H = load <4 x i8>* undef - %Y = sext <4 x i8> %H to <4 x i32> - %T = sub <4 x i32> %Y, %G - ret <4 x i32> %T -} - -define<4 x i64> @func_8_64() { - %F = load <4 x i8>* undef - %G = sext <4 x i8> %F to <4 x i64> - %H = load <4 x i8>* undef - %Y = sext <4 x i8> %H to <4 x i64> - %T = add <4 x i64> %Y, %G - ret <4 x i64> %T -} - -define<4 x i32> @const_16_32() { - %G = sext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i32> - ret <4 x i32> %G -} - -define<4 x i64> @const_16_64() { - %G = sext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i64> - ret <4 x i64> %G -} - diff --git a/test/CodeGen/X86/vec_shuffle-11.ll b/test/CodeGen/X86/vec_shuffle-11.ll deleted file mode 100644 index 640745ae2645..000000000000 --- a/test/CodeGen/X86/vec_shuffle-11.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 -; RUN: llc < %s -march=x86 -mattr=+sse2 
-mtriple=i386-apple-darwin | not grep mov - -define <4 x i32> @test() nounwind { - %tmp131 = call <2 x i64> @llvm.x86.sse2.psrl.dq( <2 x i64> < i64 -1, i64 -1 >, i32 96 ) ; <<2 x i64>> [#uses=1] - %tmp137 = bitcast <2 x i64> %tmp131 to <4 x i32> ; <<4 x i32>> [#uses=1] - %tmp138 = and <4 x i32> %tmp137, bitcast (<2 x i64> < i64 -1, i64 -1 > to <4 x i32>) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %tmp138 -} - -declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) diff --git a/test/CodeGen/X86/vec_shuffle-14.ll b/test/CodeGen/X86/vec_shuffle-14.ll deleted file mode 100644 index 8f2519728b77..000000000000 --- a/test/CodeGen/X86/vec_shuffle-14.ll +++ /dev/null @@ -1,70 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-32 -; RUN: llc < %s -march=x86-64 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-64 - -define <4 x i32> @t1(i32 %a) nounwind { -entry: - %tmp = insertelement <4 x i32> undef, i32 %a, i32 0 - %tmp6 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp, <4 x i32> < i32 4, i32 1, i32 2, i32 3 > ; <<4 x i32>> [#uses=1] - ret <4 x i32> %tmp6 - -; X86-32-LABEL: t1: -; X86-32: movd 4(%esp), %xmm0 - -; X86-64-LABEL: t1: -; X86-64: movd %e{{..}}, %xmm0 -} - -define <2 x i64> @t2(i64 %a) nounwind { -entry: - %tmp = insertelement <2 x i64> undef, i64 %a, i32 0 - %tmp6 = shufflevector <2 x i64> zeroinitializer, <2 x i64> %tmp, <2 x i32> < i32 2, i32 1 > ; <<4 x i32>> [#uses=1] - ret <2 x i64> %tmp6 - -; X86-32-LABEL: t2: -; X86-32: movq 4(%esp), %xmm0 - -; X86-64-LABEL: t2: -; X86-64: movd %r{{..}}, %xmm0 -} - -define <2 x i64> @t3(<2 x i64>* %a) nounwind { -entry: - %tmp4 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1] - %tmp6 = bitcast <2 x i64> %tmp4 to <4 x i32> ; <<4 x i32>> [#uses=1] - %tmp7 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp6, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x i32>> [#uses=1] - %tmp8 = bitcast <4 x i32> %tmp7 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp8 - -; X86-32-LABEL: t3: -; X86-32: movl 4(%esp) -; X86-32: movq - -; X86-64-LABEL: t3: -; X86-64: movq ({{.*}}), %xmm0 -} - -define <2 x i64> @t4(<2 x i64> %a) nounwind { -entry: - %tmp5 = bitcast <2 x i64> %a to <4 x i32> ; <<4 x i32>> [#uses=1] - %tmp6 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %tmp5, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x i32>> [#uses=1] - %tmp7 = bitcast <4 x i32> %tmp6 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp7 - -; X86-32-LABEL: t4: -; X86-32: movq %xmm0, %xmm0 - -; X86-64-LABEL: t4: -; X86-64: movq {{.*}}, %xmm0 -} - -define <2 x i64> @t5(<2 x i64> %a) nounwind { -entry: - %tmp6 = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <2 x i32> < i32 2, i32 1 > ; <<4 x i32>> [#uses=1] - ret <2 x i64> %tmp6 - -; X86-32-LABEL: t5: -; X86-32: movq %xmm0, %xmm0 - -; X86-64-LABEL: t5: -; X86-64: movq {{.*}}, %xmm0 -} diff --git a/test/CodeGen/X86/vec_shuffle-15.ll b/test/CodeGen/X86/vec_shuffle-15.ll deleted file mode 100644 index 5a9b8fd34579..000000000000 --- a/test/CodeGen/X86/vec_shuffle-15.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 - -define <2 x i64> @t00(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 0 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t01(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 1 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t02(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector 
<2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 2 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t03(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 0, i32 3 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t10(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 0 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t11(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 1 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t12(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 2 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t13(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 1, i32 3 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t20(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 0 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t21(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 1 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t22(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 2 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t23(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 2, i32 3 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t30(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 0 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t31(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 1 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t32(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 2 > - ret <2 x i64> %tmp -} - -define <2 x i64> @t33(<2 x i64> %a, <2 x i64> %b) nounwind { - %tmp = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> < i32 3, i32 3 > - ret <2 x i64> %tmp -} diff --git a/test/CodeGen/X86/vec_shuffle-16.ll b/test/CodeGen/X86/vec_shuffle-16.ll deleted file mode 100644 index 9aeb94289c87..000000000000 --- a/test/CodeGen/X86/vec_shuffle-16.ll +++ /dev/null @@ -1,43 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse,-sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i386-apple-darwin | FileCheck %s -check-prefix=sse2 - -; sse-LABEL: t1: -; sse2-LABEL: t1: -define <4 x float> @t1(<4 x float> %a, <4 x float> %b) nounwind { -; sse: shufps -; sse2: pshufd -; sse2-NEXT: ret - %tmp1 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer - ret <4 x float> %tmp1 -} - -; sse-LABEL: t2: -; sse2-LABEL: t2: -define <4 x float> @t2(<4 x float> %A, <4 x float> %B) nounwind { -; sse: shufps -; sse2: pshufd -; sse2-NEXT: ret - %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 3, i32 3, i32 3, i32 3 > - ret <4 x float> %tmp -} - -; sse-LABEL: t3: -; sse2-LABEL: t3: -define <4 x float> @t3(<4 x float> %A, <4 x float> %B) nounwind { -; sse: shufps -; sse2: pshufd -; sse2-NEXT: ret - %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 4, i32 4, i32 4, i32 4 > - ret <4 x float> %tmp -} - -; sse-LABEL: t4: -; sse2-LABEL: t4: -define 
<4 x float> @t4(<4 x float> %A, <4 x float> %B) nounwind { - -; sse: shufps -; sse2: pshufd -; sse2-NEXT: ret - %tmp = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 1, i32 3, i32 2, i32 0 > - ret <4 x float> %tmp -} diff --git a/test/CodeGen/X86/vec_shuffle-17.ll b/test/CodeGen/X86/vec_shuffle-17.ll deleted file mode 100644 index f2f96ba94af1..000000000000 --- a/test/CodeGen/X86/vec_shuffle-17.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-win32 -mattr=-avx | FileCheck %s -; CHECK-NOT: xor -; CHECK: movd {{%rdi|%rcx}}, %xmm0 -; CHECK-NOT: xor -; PR2108 - -define <2 x i64> @doload64(i64 %x) nounwind { -entry: - %tmp717 = bitcast i64 %x to double ; <double> [#uses=1] - %tmp8 = insertelement <2 x double> undef, double %tmp717, i32 0 ; <<2 x double>> [#uses=1] - %tmp9 = insertelement <2 x double> %tmp8, double 0.000000e+00, i32 1 ; <<2 x double>> [#uses=1] - %tmp11 = bitcast <2 x double> %tmp9 to <2 x i64> ; <<2 x i64>> [#uses=1] - ret <2 x i64> %tmp11 -} - diff --git a/test/CodeGen/X86/vec_shuffle-18.ll b/test/CodeGen/X86/vec_shuffle-18.ll deleted file mode 100644 index 1104a4a8856b..000000000000 --- a/test/CodeGen/X86/vec_shuffle-18.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8.8.0 | grep mov | count 7 - - %struct.vector4_t = type { <4 x float> } - -define void @swizzle(i8* %a, %struct.vector4_t* %b, %struct.vector4_t* %c) nounwind { -entry: - %tmp9 = getelementptr %struct.vector4_t* %b, i32 0, i32 0 ; <<4 x float>*> [#uses=2] - %tmp10 = load <4 x float>* %tmp9, align 16 ; <<4 x float>> [#uses=1] - %tmp14 = bitcast i8* %a to double* ; <double*> [#uses=1] - %tmp15 = load double* %tmp14 ; <double> [#uses=1] - %tmp16 = insertelement <2 x double> undef, double %tmp15, i32 0 ; <<2 x double>> [#uses=1] - %tmp18 = bitcast <2 x double> %tmp16 to <4 x float> ; <<4 x float>> [#uses=1] - %tmp19 = shufflevector <4 x float> %tmp10, <4 x float> %tmp18, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x float>> [#uses=1] - store <4 x float> %tmp19, <4 x float>* %tmp9, align 16 - %tmp28 = getelementptr %struct.vector4_t* %c, i32 0, i32 0 ; <<4 x float>*> [#uses=2] - %tmp29 = load <4 x float>* %tmp28, align 16 ; <<4 x float>> [#uses=1] - %tmp26 = getelementptr i8* %a, i32 8 ; <i8*> [#uses=1] - %tmp33 = bitcast i8* %tmp26 to double* ; <double*> [#uses=1] - %tmp34 = load double* %tmp33 ; <double> [#uses=1] - %tmp35 = insertelement <2 x double> undef, double %tmp34, i32 0 ; <<2 x double>> [#uses=1] - %tmp37 = bitcast <2 x double> %tmp35 to <4 x float> ; <<4 x float>> [#uses=1] - %tmp38 = shufflevector <4 x float> %tmp29, <4 x float> %tmp37, <4 x i32> < i32 4, i32 5, i32 2, i32 3 > ; <<4 x float>> [#uses=1] - store <4 x float> %tmp38, <4 x float>* %tmp28, align 16 - ret void -} diff --git a/test/CodeGen/X86/vec_shuffle-19.ll b/test/CodeGen/X86/vec_shuffle-19.ll deleted file mode 100644 index 48db8de0d936..000000000000 --- a/test/CodeGen/X86/vec_shuffle-19.ll +++ /dev/null @@ -1,9 +0,0 @@ -; REQUIRES: asserts -; RUN: llc < %s -o /dev/null -march=x86 -mcpu=penryn -mattr=+sse2 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 4 -; PR2485 - -define <4 x i32> @t(<4 x i32> %a, <4 x i32> %b) nounwind { -entry: - %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> < i32 4, i32 0, i32 0, i32 0 > ; <<4 x i32>> [#uses=1] - ret <4 x i32> %shuffle -} diff --git a/test/CodeGen/X86/vec_shuffle-20.ll 
b/test/CodeGen/X86/vec_shuffle-20.ll deleted file mode 100644 index 5a2c4449456b..000000000000 --- a/test/CodeGen/X86/vec_shuffle-20.ll +++ /dev/null @@ -1,8 +0,0 @@ -; REQUIRES: asserts -; RUN: llc < %s -o /dev/null -march=x86 -mcpu=corei7 -mtriple=i686-apple-darwin9 -stats -info-output-file - | grep asm-printer | grep 2 - -define <4 x float> @func(<4 x float> %fp0, <4 x float> %fp1) nounwind { -entry: - shufflevector <4 x float> %fp0, <4 x float> %fp1, <4 x i32> < i32 0, i32 1, i32 2, i32 7 > ; <<4 x float>>:0 [#uses=1] - ret <4 x float> %0 -} diff --git a/test/CodeGen/X86/vec_shuffle-22.ll b/test/CodeGen/X86/vec_shuffle-22.ll deleted file mode 100644 index 6807e4d63909..000000000000 --- a/test/CodeGen/X86/vec_shuffle-22.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=pentium-m | FileCheck %s - -define <4 x float> @t1(<4 x float> %a) nounwind { -; CHECK: movlhps - %tmp1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 > ; <<4 x float>> [#uses=1] - ret <4 x float> %tmp1 -} - -define <4 x i32> @t2(<4 x i32>* %a) nounwind { -; CHECK: pshufd -; CHECK: ret - %tmp1 = load <4 x i32>* %a - %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> < i32 0, i32 1, i32 0, i32 1 > ; <<4 x i32>> [#uses=1] - ret <4 x i32> %tmp2 -} diff --git a/test/CodeGen/X86/vec_shuffle-23.ll b/test/CodeGen/X86/vec_shuffle-23.ll deleted file mode 100644 index 24687359cc5a..000000000000 --- a/test/CodeGen/X86/vec_shuffle-23.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep punpck -; RUN: llc < %s -march=x86 -mattr=+sse2 | grep pshufd - -define i32 @t() nounwind { -entry: - %a = alloca <4 x i32> ; <<4 x i32>*> [#uses=2] - %b = alloca <4 x i32> ; <<4 x i32>*> [#uses=5] - store volatile <4 x i32> < i32 0, i32 1, i32 2, i32 3 >, <4 x i32>* %a - %tmp = load <4 x i32>* %a ; <<4 x i32>> [#uses=1] - store <4 x i32> %tmp, <4 x i32>* %b - %tmp1 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %tmp2 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %punpckldq = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x i32>> [#uses=1] - store <4 x i32> %punpckldq, <4 x i32>* %b - %tmp3 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %result = extractelement <4 x i32> %tmp3, i32 0 ; <i32> [#uses=1] - ret i32 %result -} diff --git a/test/CodeGen/X86/vec_shuffle-24.ll b/test/CodeGen/X86/vec_shuffle-24.ll deleted file mode 100644 index d038dafaf294..000000000000 --- a/test/CodeGen/X86/vec_shuffle-24.ll +++ /dev/null @@ -1,18 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s - -define i32 @t() nounwind optsize { -entry: -; CHECK: punpckldq - %a = alloca <4 x i32> ; <<4 x i32>*> [#uses=2] - %b = alloca <4 x i32> ; <<4 x i32>*> [#uses=5] - store volatile <4 x i32> < i32 0, i32 1, i32 2, i32 3 >, <4 x i32>* %a - %tmp = load <4 x i32>* %a ; <<4 x i32>> [#uses=1] - store <4 x i32> %tmp, <4 x i32>* %b - %tmp1 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %tmp2 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %punpckldq = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x i32>> [#uses=1] - store <4 x i32> %punpckldq, <4 x i32>* %b - %tmp3 = load <4 x i32>* %b ; <<4 x i32>> [#uses=1] - %result = extractelement <4 x i32> %tmp3, i32 0 ; <i32> [#uses=1] - ret i32 %result -} diff --git a/test/CodeGen/X86/vec_shuffle-25.ll b/test/CodeGen/X86/vec_shuffle-25.ll deleted file mode 100644 index 3f42a132ef2b..000000000000 --- 
a/test/CodeGen/X86/vec_shuffle-25.ll +++ /dev/null @@ -1,34 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=sse4.1 -o %t -; RUN: grep unpcklps %t | count 3 -; RUN: grep unpckhps %t | count 1 - -; Transpose example using the more generic vector shuffle. We return -; float8 instead of float16 since x86 can return that in register. -; ModuleID = 'transpose2_opt.bc' -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" -target triple = "i386-apple-cl.1.0" -@r0 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r1 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r2 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r3 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] - -define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind { -entry: - %unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] - %unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] - %unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] - %unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] - %unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] - %unpcklps14a = shufflevector <4 x float> %unpcklps14, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] - %unpckhps17a = shufflevector <4 x float> %unpckhps17, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %r1 = shufflevector <16 x float> %unpcklps14a, <16 x float> %unpckhps17a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> - %unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] - %unpcklps20a = shufflevector <4 x float> %unpcklps20, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %r2 = shufflevector <16 x float> %r1, <16 x float> %unpcklps20a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15> - %unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] - %unpckhps23a = shufflevector <4 x float> %unpckhps23, <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - %r3 = shufflevector <16 x float> %r2, <16 x 
float> %unpckhps23a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19> - %r4 = shufflevector <16 x float> %r3, <16 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - ret <8 x float> %r4 -} diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll deleted file mode 100644 index 00e8e73e184e..000000000000 --- a/test/CodeGen/X86/vec_shuffle-26.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse4.1 | FileCheck %s -; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s - -; Transpose example using the more generic vector shuffle. Return float8 -; instead of float16 -; ModuleID = 'transpose2_opt.bc' -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" -target triple = "i386-apple-cl.1.0" -@r0 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r1 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r2 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] -@r3 = common global <4 x float> zeroinitializer, align 16 ; <<4 x float>*> [#uses=1] - -define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind { -entry: -; CHECK: transpose2 -; CHECK: unpckhps -; CHECK: unpckhps -; CHECK: unpcklps -; CHECK: unpckhps -; Different instruction order for Atom. -; ATOM: transpose2 -; ATOM: unpckhps -; ATOM: unpckhps -; ATOM: unpckhps -; ATOM: unpcklps - %unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] - %unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] - %unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] - %unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] - %unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] - %unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] - %r1 = shufflevector <4 x float> %unpcklps14, <4 x float> %unpckhps17, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > - %unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=1] - %unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] - %r2 = shufflevector <4 x float> %unpcklps20, <4 x float> %unpckhps23, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 > -; %r3 = shufflevector <8 x float> %r1, <8 x float> %r2, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >; - ret <8 x float> %r2 -} - -define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind { -entry: -; movhps should happen before extractps to assure it gets the correct value. 
-; CHECK: lo_hi_shift -; CHECK: movhps ([[BASEREG:%[a-z]+]]), -; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) -; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) -; ATOM: lo_hi_shift -; ATOM: movhps ([[BASEREG:%[a-z]+]]), -; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) -; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) - %v.i = bitcast float* %y to <4 x float>* - %0 = load <4 x float>* %v.i, align 1 - %1 = bitcast float* %x to <1 x i64>* - %.val = load <1 x i64>* %1, align 1 - %2 = bitcast <1 x i64> %.val to <2 x float> - %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> - %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - %cast.i = bitcast <4 x float> %0 to <2 x i64> - %extract.i = extractelement <2 x i64> %cast.i, i32 1 - %3 = bitcast float* %x to i64* - store i64 %extract.i, i64* %3, align 4 - %4 = bitcast <4 x float> %0 to <16 x i8> - %5 = bitcast <4 x float> %shuffle1.i to <16 x i8> - %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> - %6 = bitcast <16 x i8> %palignr to <2 x i64> - ret <2 x i64> %6 -} diff --git a/test/CodeGen/X86/vec_shuffle-27.ll b/test/CodeGen/X86/vec_shuffle-27.ll deleted file mode 100644 index c9b2fb51d78f..000000000000 --- a/test/CodeGen/X86/vec_shuffle-27.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s - -; ModuleID = 'vec_shuffle-27.bc' -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" -target triple = "i686-apple-cl.1.0" - -define <8 x float> @my2filter4_1d(<4 x float> %a, <8 x float> %T0, <8 x float> %T1) nounwind readnone { -entry: -; CHECK: subps -; CHECK: subps -; CHECK: mulps -; CHECK: mulps -; CHECK: addps -; CHECK: addps - %tmp7 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3 > ; <<8 x float>> [#uses=1] - %sub = fsub <8 x float> %T1, %T0 ; <<8 x float>> [#uses=1] - %mul = fmul <8 x float> %sub, %tmp7 ; <<8 x float>> [#uses=1] - %add = fadd <8 x float> %mul, %T0 ; <<8 x float>> [#uses=1] - ret <8 x float> %add -} - -; Test case for r122206 -define void @test2(<4 x i64>* %ap, <4 x i64>* %bp) nounwind { -entry: -; CHECK: movdqa - %a = load <4 x i64> * %ap - %b = load <4 x i64> * %bp - %mulaa = mul <4 x i64> %a, %a - %mulbb = mul <4 x i64> %b, %b - %mulab = mul <4 x i64> %a, %b - %vect1271 = shufflevector <4 x i64> %mulaa, <4 x i64> %mulbb, <4 x i32> <i32 0, i32 4, i32 undef, i32 undef> - %vect1272 = shufflevector <4 x i64> %mulaa, <4 x i64> %mulbb, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef> - %vect1487 = shufflevector <4 x i64> %vect1271, <4 x i64> %mulab, <4 x i32> <i32 0, i32 1, i32 2, i32 4> - %vect1488 = shufflevector <4 x i64> %vect1272, <4 x i64> %mulab, <4 x i32> <i32 0, i32 1, i32 2, i32 5> - store <4 x i64> %vect1487, <4 x i64>* %ap - store <4 x i64> %vect1488, <4 x i64>* %bp - ret void; -} diff --git a/test/CodeGen/X86/vec_shuffle-28.ll b/test/CodeGen/X86/vec_shuffle-28.ll deleted file mode 100644 index ebf557762cb9..000000000000 --- a/test/CodeGen/X86/vec_shuffle-28.ll +++ /dev/null @@ -1,14 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s - -; CHECK: pshufb -; CHECK-NOT: pshufb - -; FIXME: this test has a 
superfluous punpcklqdq pre-pshufb currently. -; Don't XFAIL it because it's still better than the previous code. - -; Pack various elements via shuffles. -define <8 x i16> @shuf1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp7 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 1, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef , i32 undef > - ret <8 x i16> %tmp7 -} diff --git a/test/CodeGen/X86/vec_shuffle-30.ll b/test/CodeGen/X86/vec_shuffle-30.ll deleted file mode 100644 index f5f88426058c..000000000000 --- a/test/CodeGen/X86/vec_shuffle-30.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s - -; CHECK: test -; Test case when creating pshufhw, we incorrectly set the higher order bit -; for an undef, -define void @test(<8 x i16>* %dest, <8 x i16> %in) nounwind { -entry: -; CHECK-NOT: vmovaps -; CHECK: vmovlpd -; CHECK: vpshufhw $-95 - %0 = load <8 x i16>* %dest - %1 = shufflevector <8 x i16> %0, <8 x i16> %in, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 13, i32 undef, i32 14, i32 14> - store <8 x i16> %1, <8 x i16>* %dest - ret void -} - -; CHECK: test2 -; A test case where we shouldn't generate a punpckldq but a pshufd and a pslldq -define void @test2(<4 x i32>* %dest, <4 x i32> %in) nounwind { -entry: -; CHECK-NOT: pslldq -; CHECK: shufps - %0 = shufflevector <4 x i32> %in, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> < i32 undef, i32 5, i32 undef, i32 2> - store <4 x i32> %0, <4 x i32>* %dest - ret void -} diff --git a/test/CodeGen/X86/vec_shuffle-31.ll b/test/CodeGen/X86/vec_shuffle-31.ll deleted file mode 100644 index bb06e15425bb..000000000000 --- a/test/CodeGen/X86/vec_shuffle-31.ll +++ /dev/null @@ -1,8 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=core2 -o %t -; RUN: grep pshufb %t | count 1 - -define <8 x i16> @shuf3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 0, i32 1, i32 undef, i32 undef, i32 3, i32 11, i32 undef , i32 undef > - ret <8 x i16> %tmp9 -} diff --git a/test/CodeGen/X86/vec_shuffle-34.ll b/test/CodeGen/X86/vec_shuffle-34.ll deleted file mode 100644 index d057b3fa7ea8..000000000000 --- a/test/CodeGen/X86/vec_shuffle-34.ll +++ /dev/null @@ -1,7 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=core2 | grep pshufb | count 2 - -define <8 x i16> @shuf2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -entry: - %tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef > - ret <8 x i16> %tmp8 -} diff --git a/test/CodeGen/X86/vec_shuffle-35.ll b/test/CodeGen/X86/vec_shuffle-35.ll deleted file mode 100644 index f5083b4b8011..000000000000 --- a/test/CodeGen/X86/vec_shuffle-35.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=yonah -stack-alignment=16 -o %t -; RUN: grep pextrw %t | count 12 -; RUN: grep pinsrw %t | count 13 -; RUN: grep rolw %t | count 13 -; RUN: not grep esp %t -; RUN: not grep ebp %t -; RUN: llc < %s -march=x86 -mcpu=core2 -stack-alignment=16 -o %t -; RUN: grep pshufb %t | count 3 - -define <16 x i8> @shuf1(<16 x i8> %T0) nounwind readnone { -entry: - %tmp8 = shufflevector <16 x i8> %T0, <16 x i8> undef, <16 x i32> < i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 > - ret <16 x i8> %tmp8 -} - -define <16 x i8> @shuf2(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { -entry: - %tmp8 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x 
i32> < i32 undef, i32 undef, i32 3, i32 2, i32 17, i32 16, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 15 , i32 14 > - ret <16 x i8> %tmp8 -} diff --git a/test/CodeGen/X86/vec_shuffle-36.ll b/test/CodeGen/X86/vec_shuffle-36.ll deleted file mode 100644 index f1d0f939e60c..000000000000 --- a/test/CodeGen/X86/vec_shuffle-36.ll +++ /dev/null @@ -1,16 +0,0 @@ -; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=sse4.1 | FileCheck %s - -define <8 x i16> @shuf6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { -; CHECK: pshufb -; CHECK-NOT: pshufb -; CHECK: ret -entry: - %tmp9 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 3, i32 2, i32 0, i32 2, i32 1, i32 5, i32 6 , i32 undef > - ret <8 x i16> %tmp9 -} - -define <8 x i16> @shuf7(<8 x i16> %t0) { -; CHECK: pshufd - %tmp10 = shufflevector <8 x i16> %t0, <8 x i16> undef, <8 x i32> < i32 undef, i32 2, i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef > - ret <8 x i16> %tmp10 -} diff --git a/test/CodeGen/X86/vec_shuffle-37.ll b/test/CodeGen/X86/vec_shuffle-37.ll deleted file mode 100644 index ed285f93fe1b..000000000000 --- a/test/CodeGen/X86/vec_shuffle-37.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core2 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=core2 | FileCheck %s -; RUN: llc -O0 < %s -march=x86 -mcpu=core2 | FileCheck %s --check-prefix=CHECK_O0 - -define <4 x i32> @t00(<4 x i32>* %a0) nounwind ssp { -entry: -; CHECK: movaps ({{%rdi|%rcx}}), %[[XMM0:xmm[0-9]+]] -; CHECK: movaps %[[XMM0]], %[[XMM1:xmm[0-9]+]] -; CHECK-NEXT: movss %xmm{{[0-9]+}}, %[[XMM1]] -; CHECK-NEXT: shufps $36, %[[XMM1]], %[[XMM0]] - %0 = load <4 x i32>* undef, align 16 - %1 = load <4 x i32>* %a0, align 16 - %2 = shufflevector <4 x i32> %1, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4> - ret <4 x i32> %2 -} - -define void @t01(double* %a0) nounwind ssp { -entry: -; CHECK_O0: movsd (%eax), %xmm0 -; CHECK_O0: unpcklpd %xmm0, %xmm0 - %tmp93 = load double* %a0, align 8 - %vecinit94 = insertelement <2 x double> undef, double %tmp93, i32 1 - store <2 x double> %vecinit94, <2 x double>* undef - ret void -} - -define void @t02(<8 x i32>* %source, <2 x i32>* %dest) nounwind noinline { -entry: -; CHECK: t02 -; CHECK: movaps -; CHECK: shufps -; CHECK: pshufd -; CHECK: movq -; CHECK: ret - %0 = bitcast <8 x i32>* %source to <4 x i32>* - %arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3 - %tmp2 = load <4 x i32>* %arrayidx, align 16 - %tmp3 = extractelement <4 x i32> %tmp2, i32 0 - %tmp5 = insertelement <2 x i32> <i32 undef, i32 0>, i32 %tmp3, i32 0 - %arrayidx7 = getelementptr inbounds <8 x i32>* %source, i64 1 - %1 = bitcast <8 x i32>* %arrayidx7 to <4 x i32>* - %tmp8 = load <4 x i32>* %1, align 16 - %tmp9 = extractelement <4 x i32> %tmp8, i32 1 - %tmp11 = insertelement <2 x i32> %tmp5, i32 %tmp9, i32 1 - store <2 x i32> %tmp11, <2 x i32>* %dest, align 8 - ret void -} diff --git a/test/CodeGen/X86/vec_shuffle-38.ll b/test/CodeGen/X86/vec_shuffle-38.ll deleted file mode 100644 index ec196df7aeff..000000000000 --- a/test/CodeGen/X86/vec_shuffle-38.ll +++ /dev/null @@ -1,77 +0,0 @@ -; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s - -define <2 x double> @ld(<2 x double> %p) nounwind optsize ssp { -; CHECK: unpcklpd - %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> zeroinitializer - ret <2 x double> %shuffle -} - -define <2 x double> @hd(<2 x double> %p) nounwind optsize ssp { -; CHECK: unpckhpd - %shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x 
i32> <i32 1, i32 1> - ret <2 x double> %shuffle -} - -define <2 x i64> @ldi(<2 x i64> %p) nounwind optsize ssp { -; CHECK: punpcklqdq - %shuffle = shufflevector <2 x i64> %p, <2 x i64> undef, <2 x i32> zeroinitializer - ret <2 x i64> %shuffle -} - -define <2 x i64> @hdi(<2 x i64> %p) nounwind optsize ssp { -; CHECK: punpckhqdq - %shuffle = shufflevector <2 x i64> %p, <2 x i64> undef, <2 x i32> <i32 1, i32 1> - ret <2 x i64> %shuffle -} - -; rdar://10050549 -%struct.Float2 = type { float, float } - -define <4 x float> @loadhpi(%struct.Float2* %vPtr, <4 x float> %vecin1) nounwind readonly ssp { -entry: -; CHECK: loadhpi -; CHECK-NOT: movq -; CHECK: movhps ( - %tmp1 = bitcast %struct.Float2* %vPtr to <1 x i64>* - %addptr7 = getelementptr inbounds <1 x i64>* %tmp1, i64 0 - %tmp2 = bitcast <1 x i64>* %addptr7 to float* - %tmp3 = load float* %tmp2, align 4 - %vec = insertelement <4 x float> undef, float %tmp3, i32 0 - %addptr.i12 = getelementptr inbounds float* %tmp2, i64 1 - %tmp4 = load float* %addptr.i12, align 4 - %vecin2 = insertelement <4 x float> %vec, float %tmp4, i32 1 - %shuffle = shufflevector <4 x float> %vecin1, <4 x float> %vecin2, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - ret <4 x float> %shuffle -} - -; rdar://10119696 -; CHECK: f -define <4 x float> @f(<4 x float> %x, double* nocapture %y) nounwind readonly ssp { -entry: - ; CHECK: movlps (%{{rdi|rdx}}), %xmm0 - %u110.i = load double* %y, align 1 - %tmp8.i = insertelement <2 x double> undef, double %u110.i, i32 0 - %tmp9.i = bitcast <2 x double> %tmp8.i to <4 x float> - %shuffle.i = shufflevector <4 x float> %x, <4 x float> %tmp9.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3> - ret <4 x float> %shuffle.i -} - -define <4 x float> @loadhpi2(%struct.Float2* nocapture %vHiCoefPtr_0, %struct.Float2* nocapture %vLoCoefPtr_0, i32 %s) nounwind readonly ssp { -entry: -; CHECK: loadhpi2 -; CHECK: movhps ( -; CHECK-NOT: movlhps - %0 = bitcast %struct.Float2* %vHiCoefPtr_0 to <1 x i64>* - %idx.ext = sext i32 %s to i64 - %add.ptr = getelementptr inbounds <1 x i64>* %0, i64 %idx.ext - %add.ptr.val = load <1 x i64>* %add.ptr, align 1 - %1 = bitcast <1 x i64> %add.ptr.val to <2 x float> - %shuffle.i = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> - %2 = bitcast %struct.Float2* %vLoCoefPtr_0 to <1 x i64>* - %add.ptr2 = getelementptr inbounds <1 x i64>* %2, i64 %idx.ext - %add.ptr2.val = load <1 x i64>* %add.ptr2, align 1 - %3 = bitcast <1 x i64> %add.ptr2.val to <2 x float> - %shuffle.i4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> - %shuffle1.i5 = shufflevector <4 x float> %shuffle.i, <4 x float> %shuffle.i4, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - ret <4 x float> %shuffle1.i5 -} diff --git a/test/CodeGen/X86/vec_shuffle-39.ll b/test/CodeGen/X86/vec_shuffle-39.ll deleted file mode 100644 index 8fd9a5cd023e..000000000000 --- a/test/CodeGen/X86/vec_shuffle-39.ll +++ /dev/null @@ -1,86 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn | FileCheck %s -; rdar://10050222, rdar://10134392 - -define <4 x float> @t1(<4 x float> %a, <1 x i64>* nocapture %p) nounwind { -entry: -; CHECK-LABEL: t1: -; CHECK: movlps (%rdi), %xmm0 -; CHECK: ret - %p.val = load <1 x i64>* %p, align 1 - %0 = bitcast <1 x i64> %p.val to <2 x float> - %shuffle.i = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> - %shuffle1.i = shufflevector <4 x float> %a, <4 x float> %shuffle.i, <4 x i32> <i32 4, i32 5, i32 2, i32 3> - ret <4 
x float> %shuffle1.i -} - -define <4 x float> @t1a(<4 x float> %a, <1 x i64>* nocapture %p) nounwind { -entry: -; CHECK-LABEL: t1a: -; CHECK: movlps (%rdi), %xmm0 -; CHECK: ret - %0 = bitcast <1 x i64>* %p to double* - %1 = load double* %0 - %2 = insertelement <2 x double> undef, double %1, i32 0 - %3 = bitcast <2 x double> %2 to <4 x float> - %4 = shufflevector <4 x float> %a, <4 x float> %3, <4 x i32> <i32 4, i32 5, i32 2, i32 3> - ret <4 x float> %4 -} - -define void @t2(<1 x i64>* nocapture %p, <4 x float> %a) nounwind { -entry: -; CHECK-LABEL: t2: -; CHECK: movlps %xmm0, (%rdi) -; CHECK: ret - %cast.i = bitcast <4 x float> %a to <2 x i64> - %extract.i = extractelement <2 x i64> %cast.i, i32 0 - %0 = getelementptr inbounds <1 x i64>* %p, i64 0, i64 0 - store i64 %extract.i, i64* %0, align 8 - ret void -} - -define void @t2a(<1 x i64>* nocapture %p, <4 x float> %a) nounwind { -entry: -; CHECK-LABEL: t2a: -; CHECK: movlps %xmm0, (%rdi) -; CHECK: ret - %0 = bitcast <1 x i64>* %p to double* - %1 = bitcast <4 x float> %a to <2 x double> - %2 = extractelement <2 x double> %1, i32 0 - store double %2, double* %0 - ret void -} - -; rdar://10436044 -define <2 x double> @t3() nounwind readonly { -bb: -; CHECK-LABEL: t3: -; CHECK: movq (%rax), %xmm1 -; CHECK: punpcklqdq %xmm2, %xmm0 -; CHECK: movsd %xmm1, %xmm0 - %tmp0 = load i128* null, align 1 - %tmp1 = load <2 x i32>* undef, align 8 - %tmp2 = bitcast i128 %tmp0 to <16 x i8> - %tmp3 = bitcast <2 x i32> %tmp1 to i64 - %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0 - %tmp5 = bitcast <16 x i8> %tmp2 to <2 x double> - %tmp6 = bitcast <2 x i64> %tmp4 to <2 x double> - %tmp7 = shufflevector <2 x double> %tmp5, <2 x double> %tmp6, <2 x i32> <i32 2, i32 1> - ret <2 x double> %tmp7 -} - -; rdar://10450317 -define <2 x i64> @t4() nounwind readonly { -bb: -; CHECK-LABEL: t4: -; CHECK: movq (%rax), %xmm0 -; CHECK: punpcklqdq %{{xmm.}}, %[[XMM:xmm[0-9]]] -; CHECK: movsd %[[XMM]], %xmm0 - %tmp0 = load i128* null, align 1 - %tmp1 = load <2 x i32>* undef, align 8 - %tmp2 = bitcast i128 %tmp0 to <16 x i8> - %tmp3 = bitcast <2 x i32> %tmp1 to i64 - %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0 - %tmp5 = bitcast <16 x i8> %tmp2 to <2 x i64> - %tmp6 = shufflevector <2 x i64> %tmp4, <2 x i64> %tmp5, <2 x i32> <i32 2, i32 1> - ret <2 x i64> %tmp6 -} diff --git a/test/CodeGen/X86/vec_shuffle-40.ll b/test/CodeGen/X86/vec_shuffle-40.ll deleted file mode 100644 index 75b45e3df111..000000000000 --- a/test/CodeGen/X86/vec_shuffle-40.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s - -define void @shuffle_v16i16(<16 x i16>* %a) { -; CHECK-LABEL: shuffle_v16i16: -; CHECK: vpshufb {{.*}}%ymm -; CHECK-NOT: vpshufb {{.*}}%xmm -entry: - %0 = load <16 x i16>* %a, align 32 - %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15> - store <16 x i16> %shuffle, <16 x i16>* %a, align 32 - ret void -} - -define void @shuffle_v16i16_lanecrossing(<16 x i16>* %a) { -; CHECK-LABEL: shuffle_v16i16_lanecrossing: -; CHECK-NOT: vpshufb {{.*}}%ymm -entry: - %0 = load <16 x i16>* %a, align 32 - %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 13, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15> - store <16 x i16> %shuffle, <16 x i16>* %a, align 32 - ret void -} diff --git 
a/test/CodeGen/X86/vec_shuffle-41.ll b/test/CodeGen/X86/vec_shuffle-41.ll deleted file mode 100644 index 28fdd2f5ce17..000000000000 --- a/test/CodeGen/X86/vec_shuffle-41.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s - -; Use buildFromShuffleMostly which allows this to be generated as two 128-bit -; shuffles and an insert. - -; This is the (somewhat questionable) LLVM IR that is generated for: -; x8.s0123456 = x8.s1234567; // x8 is a <8 x float> type -; x8.s7 = f; // f is float - - -define <8 x float> @test1(<8 x float> %a, float %b) { -; CHECK-LABEL: test1: -; CHECK: vinsertps -; CHECK-NOT: vinsertps -entry: - %shift = shufflevector <8 x float> %a, <8 x float> undef, <7 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> - %extend = shufflevector <7 x float> %shift, <7 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef> - %insert = insertelement <8 x float> %extend, float %b, i32 7 - - ret <8 x float> %insert -} diff --git a/test/CodeGen/X86/vec_shuffle.ll b/test/CodeGen/X86/vec_shuffle.ll deleted file mode 100644 index 65995984859b..000000000000 --- a/test/CodeGen/X86/vec_shuffle.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: llc < %s -mtriple=i686-linux -mcpu=core2 | FileCheck %s - -; CHECK: test_v4sf -; CHECK: movq 8(%esp) -; CHECK: pshufd $80 -define void @test_v4sf(<4 x float>* %P, float %X, float %Y) nounwind { - %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1] - %tmp2 = insertelement <4 x float> %tmp, float %X, i32 1 ; <<4 x float>> [#uses=1] - %tmp4 = insertelement <4 x float> %tmp2, float %Y, i32 2 ; <<4 x float>> [#uses=1] - %tmp6 = insertelement <4 x float> %tmp4, float %Y, i32 3 ; <<4 x float>> [#uses=1] - store <4 x float> %tmp6, <4 x float>* %P - ret void -} - -; CHECK: test_v2sd -; CHECK: movups 8(%esp) -; CHECK: movaps -define void @test_v2sd(<2 x double>* %P, double %X, double %Y) nounwind { - %tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0 ; <<2 x double>> [#uses=1] - %tmp2 = insertelement <2 x double> %tmp, double %Y, i32 1 ; <<2 x double>> [#uses=1] - store <2 x double> %tmp2, <2 x double>* %P - ret void -} - -; CHECK: test_v8i16 -; CHECK: pshufhw $-58 -; CHECK: movdqa -define void @test_v8i16(<2 x i64>* %res, <2 x i64>* %A) nounwind { - %tmp = load <2 x i64>* %A ; <<2 x i64>> [#uses=1] - %tmp.upgrd.1 = bitcast <2 x i64> %tmp to <8 x i16> ; <<8 x i16>> [#uses=8] - %tmp.upgrd.2 = extractelement <8 x i16> %tmp.upgrd.1, i32 0 ; <i16> [#uses=1] - %tmp1 = extractelement <8 x i16> %tmp.upgrd.1, i32 1 ; <i16> [#uses=1] - %tmp2 = extractelement <8 x i16> %tmp.upgrd.1, i32 2 ; <i16> [#uses=1] - %tmp3 = extractelement <8 x i16> %tmp.upgrd.1, i32 3 ; <i16> [#uses=1] - %tmp4 = extractelement <8 x i16> %tmp.upgrd.1, i32 6 ; <i16> [#uses=1] - %tmp5 = extractelement <8 x i16> %tmp.upgrd.1, i32 5 ; <i16> [#uses=1] - %tmp6 = extractelement <8 x i16> %tmp.upgrd.1, i32 4 ; <i16> [#uses=1] - %tmp7 = extractelement <8 x i16> %tmp.upgrd.1, i32 7 ; <i16> [#uses=1] - %tmp8 = insertelement <8 x i16> undef, i16 %tmp.upgrd.2, i32 0 ; <<8 x i16>> [#uses=1] - %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp1, i32 1 ; <<8 x i16>> [#uses=1] - %tmp10 = insertelement <8 x i16> %tmp9, i16 %tmp2, i32 2 ; <<8 x i16>> [#uses=1] - %tmp11 = insertelement <8 x i16> %tmp10, i16 %tmp3, i32 3 ; <<8 x i16>> [#uses=1] - %tmp12 = insertelement <8 x i16> %tmp11, i16 %tmp4, i32 4 ; <<8 x i16>> [#uses=1] - %tmp13 = insertelement <8 x i16> %tmp12, i16 %tmp5, 
i32 5 ; <<8 x i16>> [#uses=1] - %tmp14 = insertelement <8 x i16> %tmp13, i16 %tmp6, i32 6 ; <<8 x i16>> [#uses=1] - %tmp15 = insertelement <8 x i16> %tmp14, i16 %tmp7, i32 7 ; <<8 x i16>> [#uses=1] - %tmp15.upgrd.3 = bitcast <8 x i16> %tmp15 to <2 x i64> ; <<2 x i64>> [#uses=1] - store <2 x i64> %tmp15.upgrd.3, <2 x i64>* %res - ret void -} diff --git a/test/CodeGen/X86/vec_splat-2.ll b/test/CodeGen/X86/vec_splat-2.ll deleted file mode 100644 index 9d82f97dca1c..000000000000 --- a/test/CodeGen/X86/vec_splat-2.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s - -define void @test(<2 x i64>* %P, i8 %x) nounwind { - %tmp = insertelement <16 x i8> zeroinitializer, i8 %x, i32 0 ; <<16 x i8>> [#uses=1] - %tmp36 = insertelement <16 x i8> %tmp, i8 %x, i32 1 ; <<16 x i8>> [#uses=1] - %tmp38 = insertelement <16 x i8> %tmp36, i8 %x, i32 2 ; <<16 x i8>> [#uses=1] - %tmp40 = insertelement <16 x i8> %tmp38, i8 %x, i32 3 ; <<16 x i8>> [#uses=1] - %tmp42 = insertelement <16 x i8> %tmp40, i8 %x, i32 4 ; <<16 x i8>> [#uses=1] - %tmp44 = insertelement <16 x i8> %tmp42, i8 %x, i32 5 ; <<16 x i8>> [#uses=1] - %tmp46 = insertelement <16 x i8> %tmp44, i8 %x, i32 6 ; <<16 x i8>> [#uses=1] - %tmp48 = insertelement <16 x i8> %tmp46, i8 %x, i32 7 ; <<16 x i8>> [#uses=1] - %tmp50 = insertelement <16 x i8> %tmp48, i8 %x, i32 8 ; <<16 x i8>> [#uses=1] - %tmp52 = insertelement <16 x i8> %tmp50, i8 %x, i32 9 ; <<16 x i8>> [#uses=1] - %tmp54 = insertelement <16 x i8> %tmp52, i8 %x, i32 10 ; <<16 x i8>> [#uses=1] - %tmp56 = insertelement <16 x i8> %tmp54, i8 %x, i32 11 ; <<16 x i8>> [#uses=1] - %tmp58 = insertelement <16 x i8> %tmp56, i8 %x, i32 12 ; <<16 x i8>> [#uses=1] - %tmp60 = insertelement <16 x i8> %tmp58, i8 %x, i32 13 ; <<16 x i8>> [#uses=1] - %tmp62 = insertelement <16 x i8> %tmp60, i8 %x, i32 14 ; <<16 x i8>> [#uses=1] - %tmp64 = insertelement <16 x i8> %tmp62, i8 %x, i32 15 ; <<16 x i8>> [#uses=1] - %tmp68 = load <2 x i64>* %P ; <<2 x i64>> [#uses=1] - %tmp71 = bitcast <2 x i64> %tmp68 to <16 x i8> ; <<16 x i8>> [#uses=1] - %tmp73 = add <16 x i8> %tmp71, %tmp64 ; <<16 x i8>> [#uses=1] - %tmp73.upgrd.1 = bitcast <16 x i8> %tmp73 to <2 x i64> ; <<2 x i64>> [#uses=1] - store <2 x i64> %tmp73.upgrd.1, <2 x i64>* %P - ret void - -; CHECK-LABEL: test: -; CHECK-NOT: pshufd -; CHECK: punpcklbw -; CHECK: punpcklbw -; CHECK: pshufd $0 -; CHECK-NOT: pshufd -} diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll deleted file mode 100644 index 754cbf41867d..000000000000 --- a/test/CodeGen/X86/vec_splat-3.ll +++ /dev/null @@ -1,230 +0,0 @@ -; RUN: llc <%s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s - -; Splat test for v8i16 -define <8 x i16> @shuf_8i16_0(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_0: -; CHECK: pshuflw $0 -} - -define <8 x i16> @shuf_8i16_1(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_1: -; CHECK: pshuflw $5 -} - -define <8 x i16> @shuf_8i16_2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef, i32 
undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_2: -; CHECK: punpcklwd -; CHECK-NEXT: pshufd $-86 -} - -define <8 x i16> @shuf_8i16_3(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_3: -; CHECK: pshuflw $15 -} - -define <8 x i16> @shuf_8i16_4(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 4, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_4: -; CHECK: movhlps -} - -define <8 x i16> @shuf_8i16_5(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_5: -; CHECK: punpckhwd -; CHECK-NEXT: pshufd $85 -} - -define <8 x i16> @shuf_8i16_6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 6, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_6: -; CHECK: punpckhwd -; CHECK-NEXT: pshufd $-86 -} - -define <8 x i16> @shuf_8i16_7(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone { - %tmp6 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> <i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> - ret <8 x i16> %tmp6 - -; CHECK-LABEL: shuf_8i16_7: -; CHECK: punpckhwd -; CHECK-NEXT: pshufd $-1 -} - -; Splat test for v16i8 -define <16 x i8> @shuf_16i8_8(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_8: -; CHECK: punpcklbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $0 -} - -define <16 x i8> @shuf_16i8_9(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef > - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_9: -; CHECK: punpcklbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $85 -} - -define <16 x i8> @shuf_16i8_10(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_10: -; CHECK: punpcklbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $-86 -} - -define <16 x i8> @shuf_16i8_11(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 3, i32 undef, i32 undef, i32 3, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_11: -; CHECK: punpcklbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $-1 -} - - -define <16 x i8> @shuf_16i8_12(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 4, i32 undef, 
i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef > - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_12: -; CHECK: pshufd $5 -} - -define <16 x i8> @shuf_16i8_13(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 5, i32 undef, i32 undef, i32 5, i32 undef, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_13: -; CHECK: punpcklbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $85 -} - -define <16 x i8> @shuf_16i8_14(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 6, i32 undef, i32 undef, i32 6, i32 undef, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_14: -; CHECK: punpcklbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $-86 -} - -define <16 x i8> @shuf_16i8_15(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 7, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef > - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_15: -; CHECK: punpcklbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $-1 -} - -define <16 x i8> @shuf_16i8_16(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 8, i32 undef, i32 undef, i32 8, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_16: -; CHECK: punpckhbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $0 -} - -define <16 x i8> @shuf_16i8_17(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 9, i32 undef, i32 undef, i32 9, i32 undef, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_17: -; CHECK: punpckhbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $85 -} - -define <16 x i8> @shuf_16i8_18(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 10, i32 undef, i32 undef, i32 10, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_18: -; CHECK: punpckhbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $-86 -} - -define <16 x i8> @shuf_16i8_19(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 11, i32 undef, i32 undef, i32 11, i32 undef, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11, i32 11> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_19: -; CHECK: punpckhbw -; CHECK-NEXT: punpcklbw -; CHECK-NEXT: pshufd $-1 -} - -define <16 x i8> @shuf_16i8_20(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 12, i32 undef, i32 undef, i32 12, i32 undef, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_20: -; CHECK: punpckhbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $0 -} - -define <16 x i8> 
@shuf_16i8_21(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 13, i32 undef, i32 undef, i32 13, i32 undef, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_21: -; CHECK: punpckhbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $85 -} - -define <16 x i8> @shuf_16i8_22(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 14, i32 undef, i32 undef, i32 14, i32 undef, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_22: -; CHECK: punpckhbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $-86 -} - -define <16 x i8> @shuf_16i8_23(<16 x i8> %T0, <16 x i8> %T1) nounwind readnone { - %tmp6 = shufflevector <16 x i8> %T0, <16 x i8> %T1, <16 x i32> <i32 15, i32 undef, i32 undef, i32 15, i32 undef, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> - ret <16 x i8> %tmp6 - -; CHECK-LABEL: shuf_16i8_23: -; CHECK: punpckhbw -; CHECK-NEXT: punpckhbw -; CHECK-NEXT: pshufd $-1 -} diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll deleted file mode 100644 index a02e3836078c..000000000000 --- a/test/CodeGen/X86/vec_splat.ll +++ /dev/null @@ -1,50 +0,0 @@ -; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse2 | FileCheck %s -check-prefix=SSE2 -; RUN: llc < %s -march=x86 -mcpu=pentium4 -mattr=+sse3 | FileCheck %s -check-prefix=SSE3 - -define void @test_v4sf(<4 x float>* %P, <4 x float>* %Q, float %X) nounwind { - %tmp = insertelement <4 x float> zeroinitializer, float %X, i32 0 ; <<4 x float>> [#uses=1] - %tmp2 = insertelement <4 x float> %tmp, float %X, i32 1 ; <<4 x float>> [#uses=1] - %tmp4 = insertelement <4 x float> %tmp2, float %X, i32 2 ; <<4 x float>> [#uses=1] - %tmp6 = insertelement <4 x float> %tmp4, float %X, i32 3 ; <<4 x float>> [#uses=1] - %tmp8 = load <4 x float>* %Q ; <<4 x float>> [#uses=1] - %tmp10 = fmul <4 x float> %tmp8, %tmp6 ; <<4 x float>> [#uses=1] - store <4 x float> %tmp10, <4 x float>* %P - ret void - -; SSE2-LABEL: test_v4sf: -; SSE2: pshufd $0 - -; SSE3-LABEL: test_v4sf: -; SSE3: pshufd $0 -} - -define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind { - %tmp = insertelement <2 x double> zeroinitializer, double %X, i32 0 ; <<2 x double>> [#uses=1] - %tmp2 = insertelement <2 x double> %tmp, double %X, i32 1 ; <<2 x double>> [#uses=1] - %tmp4 = load <2 x double>* %Q ; <<2 x double>> [#uses=1] - %tmp6 = fmul <2 x double> %tmp4, %tmp2 ; <<2 x double>> [#uses=1] - store <2 x double> %tmp6, <2 x double>* %P - ret void - -; SSE2-LABEL: test_v2sd: -; SSE2: shufpd $0 - -; SSE3-LABEL: test_v2sd: -; SSE3: movddup -} - -; Fold extract of a load into the load's address computation. This avoids spilling to the stack. 
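The fold works by turning an extract of a variable lane %j from a loaded vector, which would otherwise force the vector through a stack slot, into a single scalar load from that lane's address. A minimal sketch of the scalar form, in the same typed-pointer IR syntax as these tests (the function name @extract_via_gep is illustrative, not from the patch):

    define float @extract_via_gep(<4 x float>* %ptr, i64 %i, i64 %j) {
      ; compute the address of vector %i, then of lane %j inside it:
      ; one scalar load, no vector load, no spill
      %vec.addr = getelementptr inbounds <4 x float>* %ptr, i64 %i
      %elt.addr = getelementptr inbounds <4 x float>* %vec.addr, i64 0, i64 %j
      %elt = load float* %elt.addr, align 4
      ret float %elt
    }

In the splat test below, the folded address then feeds a single vbroadcastss.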
-define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind { - %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i - %2 = load <4 x float>* %1, align 16 - %3 = extractelement <4 x float> %2, i64 %j - %4 = insertelement <4 x float> undef, float %3, i32 0 - %5 = insertelement <4 x float> %4, float %3, i32 1 - %6 = insertelement <4 x float> %5, float %3, i32 2 - %7 = insertelement <4 x float> %6, float %3, i32 3 - ret <4 x float> %7 - -; AVX-LABEL: load_extract_splat -; AVX-NOT: movs -; AVX: vbroadcastss -} diff --git a/test/CodeGen/X86/vec_trunc_sext.ll b/test/CodeGen/X86/vec_trunc_sext.ll new file mode 100644 index 000000000000..3c446bba4ea8 --- /dev/null +++ b/test/CodeGen/X86/vec_trunc_sext.ll @@ -0,0 +1,30 @@ +; RUN: llc %s -mtriple=x86_64-unknown-unknown -mattr='-sse4.1' -o - | FileCheck %s -check-prefix=NO_SSE_41 +; RUN: llc %s -mtriple=x86_64-unknown-unknown -mattr='+sse4.1' -o - | FileCheck %s -check-prefix=SSE_41 + +; PR20472 ( http://llvm.org/bugs/show_bug.cgi?id=20472 ) +; When sexting a trunc'd vector value, we can't eliminate the zext. +; If we don't have SSE4.1, use punpck. +; If we have SSE4.1, use pmovzx because it combines the load op. +; There may be a better way to do this using pshufb + pmovsx, +; but that is beyond our current codegen capabilities. + +define <4 x i32> @trunc_sext(<4 x i16>* %in) { + %load = load <4 x i16>* %in + %trunc = trunc <4 x i16> %load to <4 x i8> + %sext = sext <4 x i8> %trunc to <4 x i32> + ret <4 x i32> %sext + +; NO_SSE_41-LABEL: trunc_sext: +; NO_SSE_41: movq (%rdi), %xmm0 +; NO_SSE_41-NEXT: punpcklwd %xmm0, %xmm0 +; NO_SSE_41-NEXT: pslld $24, %xmm0 +; NO_SSE_41-NEXT: psrad $24, %xmm0 +; NO_SSE_41-NEXT: retq + +; SSE_41-LABEL: trunc_sext: +; SSE_41: pmovzxwd (%rdi), %xmm0 +; SSE_41-NEXT: pslld $24, %xmm0 +; SSE_41-NEXT: psrad $24, %xmm0 +; SSE_41-NEXT: retq +} + diff --git a/test/CodeGen/X86/vec_uint_to_fp.ll b/test/CodeGen/X86/vec_uint_to_fp.ll index ee20f1fcbd04..46cfcd9a9a12 100644 --- a/test/CodeGen/X86/vec_uint_to_fp.ll +++ b/test/CodeGen/X86/vec_uint_to_fp.ll @@ -1,11 +1,167 @@ -; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck --check-prefix=CHECK --check-prefix=SSE --check-prefix=CST %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse4.1 | FileCheck --check-prefix=CHECK --check-prefix=SSE41 --check-prefix=CST %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck --check-prefix=CHECK --check-prefix=AVX --check-prefix=CST %s +; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx2 | FileCheck --check-prefix=CHECK --check-prefix=AVX2 %s + +; Check that the constants used in the vectors are the right ones.
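These constants implement the usual branch-free uitofp lowering: each 32-bit lane is split into 16-bit halves, each half is OR'ed into the mantissa of a float with a pinned exponent (0x4b000000 is the bit pattern of 2^23, 0x53000000 of 2^39), and one add of the magic constant -(2^39 + 2^23) cancels both exponent biases at once. A scalar sketch of the per-lane arithmetic, in the same IR dialect (the function name @uitofp_by_parts is illustrative, not from the patch):

    define float @uitofp_by_parts(i32 %x) {
      %lo = and i32 %x, 65535                       ; low 16 bits
      %lobits = or i32 %lo, 1258291200              ; 0x4b000000 -> bits of 2^23 + lo
      %loflt = bitcast i32 %lobits to float
      %hi = lshr i32 %x, 16                         ; high 16 bits
      %hibits = or i32 %hi, 1392508928              ; 0x53000000 -> bits of 2^39 + hi*2^16
      %hiflt = bitcast i32 %hibits to float
      %adj = fadd float %hiflt, 0xC260001000000000  ; -(2^39 + 2^23) = -549764202496.0
      %res = fadd float %adj, %loflt                ; hi*2^16 + lo, i.e. (float)%x
      ret float %res
    }

Both ORs land in mantissa bits, so everything up to the final rounding add is exact; the 3539992704 (0xd3000080) checked below is exactly the float -(2^39 + 2^23).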
+; SSE: [[MASKCSTADDR:LCPI0_[0-9]+]]: +; SSE-NEXT: .long 65535 ## 0xffff +; SSE-NEXT: .long 65535 ## 0xffff +; SSE-NEXT: .long 65535 ## 0xffff +; SSE-NEXT: .long 65535 ## 0xffff + +; CST: [[LOWCSTADDR:LCPI0_[0-9]+]]: +; CST-NEXT: .long 1258291200 ## 0x4b000000 +; CST-NEXT: .long 1258291200 ## 0x4b000000 +; CST-NEXT: .long 1258291200 ## 0x4b000000 +; CST-NEXT: .long 1258291200 ## 0x4b000000 + +; CST: [[HIGHCSTADDR:LCPI0_[0-9]+]]: +; CST-NEXT: .long 1392508928 ## 0x53000000 +; CST-NEXT: .long 1392508928 ## 0x53000000 +; CST-NEXT: .long 1392508928 ## 0x53000000 +; CST-NEXT: .long 1392508928 ## 0x53000000 + +; CST: [[MAGICCSTADDR:LCPI0_[0-9]+]]: +; CST-NEXT: .long 3539992704 ## float -5.497642e+11 +; CST-NEXT: .long 3539992704 ## float -5.497642e+11 +; CST-NEXT: .long 3539992704 ## float -5.497642e+11 +; CST-NEXT: .long 3539992704 ## float -5.497642e+11 + +; AVX2: [[LOWCSTADDR:LCPI0_[0-9]+]]: +; AVX2-NEXT: .long 1258291200 ## 0x4b000000 + +; AVX2: [[HIGHCSTADDR:LCPI0_[0-9]+]]: +; AVX2-NEXT: .long 1392508928 ## 0x53000000 + +; AVX2: [[MAGICCSTADDR:LCPI0_[0-9]+]]: +; AVX2-NEXT: .long 3539992704 ## float -5.49764202E+11 -; Test that we are not lowering uinttofp to scalars define <4 x float> @test1(<4 x i32> %A) nounwind { ; CHECK-LABEL: test1: -; CHECK-NOT: cvtsd2ss -; CHECK: ret +; +; SSE: movdqa [[MASKCSTADDR]](%rip), [[MASK:%xmm[0-9]+]] +; SSE-NEXT: pand %xmm0, [[MASK]] +; After this instruction, MASK will have the value of the low parts +; of the vector. +; SSE-NEXT: por [[LOWCSTADDR]](%rip), [[MASK]] +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: por [[HIGHCSTADDR]](%rip), %xmm0 +; SSE-NEXT: addps [[MAGICCSTADDR]](%rip), %xmm0 +; SSE-NEXT: addps [[MASK]], %xmm0 +; SSE-NEXT: retq +; +; Currently we commute the arguments of the first blend, but this could be +; improved to match the lowering of the second blend. +; SSE41: movdqa [[LOWCSTADDR]](%rip), [[LOWVEC:%xmm[0-9]+]] +; SSE41-NEXT: pblendw $85, %xmm0, [[LOWVEC]] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pblendw $170, [[HIGHCSTADDR]](%rip), %xmm0 +; SSE41-NEXT: addps [[MAGICCSTADDR]](%rip), %xmm0 +; SSE41-NEXT: addps [[LOWVEC]], %xmm0 +; SSE41-NEXT: retq +; +; AVX: vpblendw $170, [[LOWCSTADDR]](%rip), %xmm0, [[LOWVEC:%xmm[0-9]+]] +; AVX-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]] +; AVX-NEXT: vpblendw $170, [[HIGHCSTADDR]](%rip), [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]] +; AVX-NEXT: vaddps [[MAGICCSTADDR]](%rip), [[HIGHVEC]], [[TMP:%xmm[0-9]+]] +; AVX-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0 +; AVX-NEXT: retq +; +; The lowering for AVX2 is a bit messy, because we select broadcast +; instructions, instead of folding the constant loads. 
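The single .long per label in the AVX2 blocks above, versus the four .longs in the CST blocks, is the point of that remark: with AVX2 the constant pool holds one dword and vpbroadcastd/vbroadcastss splat it at run time, instead of storing the full 16-byte vector and folding it as a memory operand. Roughly (illustrative operands and labels, not check lines from the patch):

    vpbroadcastd LCPI0_0(%rip), %xmm1     ; pool entry is one .long, splat at run time
    vpaddd LCPI0_1(%rip), %xmm0, %xmm0    ; folded form: pool entry is four .longs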
+; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%xmm[0-9]+]] +; AVX2-NEXT: vpblendw $170, [[LOWCST]], %xmm0, [[LOWVEC:%xmm[0-9]+]] +; AVX2-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]] +; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%xmm[0-9]+]] +; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]] +; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%xmm[0-9]+]] +; AVX2-NEXT: vaddps [[MAGICCST]], [[HIGHVEC]], [[TMP:%xmm[0-9]+]] +; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0 +; AVX2-NEXT: retq %C = uitofp <4 x i32> %A to <4 x float> ret <4 x float> %C } +; Match the AVX2 constants used in the next function +; AVX2: [[LOWCSTADDR:LCPI1_[0-9]+]]: +; AVX2-NEXT: .long 1258291200 ## 0x4b000000 + +; AVX2: [[HIGHCSTADDR:LCPI1_[0-9]+]]: +; AVX2-NEXT: .long 1392508928 ## 0x53000000 + +; AVX2: [[MAGICCSTADDR:LCPI1_[0-9]+]]: +; AVX2-NEXT: .long 3539992704 ## float -5.49764202E+11 + +define <8 x float> @test2(<8 x i32> %A) nounwind { +; CHECK-LABEL: test2: +; Legalization will break this into 2 x <4 x i32> on anything prior to AVX. +; The constants used in the vector instructions are shared between the +; two sequences of instructions. +; +; SSE: movdqa {{.*#+}} [[MASK:xmm[0-9]+]] = [65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]] +; SSE-NEXT: pand %[[MASK]], [[VECLOW]] +; SSE-NEXT: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200] +; SSE-NEXT: por %[[LOWCST]], [[VECLOW]] +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928] +; SSE-NEXT: por %[[HIGHCST]], %xmm0 +; SSE-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11] +; SSE-NEXT: addps %[[MAGICCST]], %xmm0 +; SSE-NEXT: addps [[VECLOW]], %xmm0 +; MASK is the low vector of the second part after this point. +; SSE-NEXT: pand %xmm1, %[[MASK]] +; SSE-NEXT: por %[[LOWCST]], %[[MASK]] +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: por %[[HIGHCST]], %xmm1 +; SSE-NEXT: addps %[[MAGICCST]], %xmm1 +; SSE-NEXT: addps %[[MASK]], %xmm1 +; SSE-NEXT: retq +; +; SSE41: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]] +; SSE41-NEXT: pblendw $170, %[[LOWCST]], [[VECLOW]] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928] +; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm0 +; SSE41-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [-5.497642e+11,-5.497642e+11,-5.497642e+11,-5.497642e+11] +; SSE41-NEXT: addps %[[MAGICCST]], %xmm0 +; SSE41-NEXT: addps [[VECLOW]], %xmm0 +; LOWCST is the low vector of the second part after this point. +; The operands of the blend are inverted because we reuse xmm1 +; in the next shift.
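For that inverted-operand note: pblendw's imm8 selects 16-bit words, with bit i set taking word i from the source operand and bit i clear keeping the destination's word, so the two masks used in this file are exact complements:

    pblendw $170, %xmm1, %xmm0   ; 170 = 0b10101010: odd words from %xmm1, even words kept
    pblendw $85, %xmm1, %xmm0    ;  85 = 0b01010101: even words from %xmm1, odd words kept

Commuting the blend's operands is therefore compensated by flipping the mask (85 = ~170 & 0xff), which is what lets the sequence below keep xmm1 live for the following psrld.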
+; SSE41-NEXT: pblendw $85, %xmm1, %[[LOWCST]] +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm1 +; SSE41-NEXT: addps %[[MAGICCST]], %xmm1 +; SSE41-NEXT: addps %[[LOWCST]], %xmm1 +; SSE41-NEXT: retq +; +; Test that we are not lowering uinttofp to scalars +; AVX-NOT: cvtsd2ss +; AVX: retq +; +; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%ymm[0-9]+]] +; AVX2-NEXT: vpblendw $170, [[LOWCST]], %ymm0, [[LOWVEC:%ymm[0-9]+]] +; AVX2-NEXT: vpsrld $16, %ymm0, [[SHIFTVEC:%ymm[0-9]+]] +; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%ymm[0-9]+]] +; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%ymm[0-9]+]] +; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%ymm[0-9]+]] +; AVX2-NEXT: vaddps [[MAGICCST]], [[HIGHVEC]], [[TMP:%ymm[0-9]+]] +; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %ymm0 +; AVX2-NEXT: retq + %C = uitofp <8 x i32> %A to <8 x float> + ret <8 x float> %C +} + +define <4 x double> @test3(<4 x i32> %arg) { +; CHECK-LABEL: test3: +; This test used to crash because we were custom lowering it as if it was +; a conversion between <4 x i32> and <4 x float>. +; AVX: vcvtdq2pd +; AVX2: vcvtdq2pd +; CHECK: retq + %tmp = uitofp <4 x i32> %arg to <4 x double> + ret <4 x double> %tmp +} diff --git a/test/CodeGen/X86/vec_unsafe-fp-math.ll b/test/CodeGen/X86/vec_unsafe-fp-math.ll new file mode 100644 index 000000000000..827d4184d111 --- /dev/null +++ b/test/CodeGen/X86/vec_unsafe-fp-math.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -enable-unsafe-fp-math -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s + +; Make sure that vectors get the same benefits as scalars when using unsafe-fp-math. + +; Subtracting zero is free. +define <4 x float> @vec_fsub_zero(<4 x float> %x) { +; CHECK-LABEL: vec_fsub_zero: +; CHECK-NOT: subps +; CHECK-NOT: xorps +; CHECK: retq + %sub = fsub <4 x float> %x, zeroinitializer + ret <4 x float> %sub +} + +; Negating doesn't require subtraction. 
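The test below expects an xorps because, once signed zeros can be ignored, negation is a pure sign-bit flip; a strict IEEE fsub 0.0, %x must return +0.0 (not -0.0) for %x == +0.0, which is why the xor form needs unsafe-fp-math. A minimal sketch of the integer-domain equivalent (the function name @fneg_via_xor is illustrative, not from the patch):

    define <4 x float> @fneg_via_xor(<4 x float> %x) {
      %bits = bitcast <4 x float> %x to <4 x i32>
      ; flip each lane's sign bit; -2147483648 is 0x80000000
      %flip = xor <4 x i32> %bits, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
      %res = bitcast <4 x i32> %flip to <4 x float>
      ret <4 x float> %res
    }

The constant-pool operand the CHECK line matches ({{.*}}LCP{{.*}}) is exactly this 0x80000000 splat.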
+define <4 x float> @vec_fneg(<4 x float> %x) { +; CHECK-LABEL: vec_fneg: +; CHECK: xorps {{.*}}LCP{{.*}}, %xmm0 +; CHECK-NOT: subps +; CHECK-NEXT: retq + %sub = fsub <4 x float> zeroinitializer, %x + ret <4 x float> %sub +} diff --git a/test/CodeGen/X86/vec_zext.ll b/test/CodeGen/X86/vec_zext.ll deleted file mode 100644 index 615a50b7afc3..000000000000 --- a/test/CodeGen/X86/vec_zext.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc < %s -march=x86-64 -; PR 9267 - -define<4 x i32> @func_16_32() { - %F = load <4 x i16>* undef - %G = zext <4 x i16> %F to <4 x i32> - %H = load <4 x i16>* undef - %Y = zext <4 x i16> %H to <4 x i32> - %T = add <4 x i32> %Y, %G - store <4 x i32>%T , <4 x i32>* undef - ret <4 x i32> %T -} - -define<4 x i64> @func_16_64() { - %F = load <4 x i16>* undef - %G = zext <4 x i16> %F to <4 x i64> - %H = load <4 x i16>* undef - %Y = zext <4 x i16> %H to <4 x i64> - %T = xor <4 x i64> %Y, %G - store <4 x i64>%T , <4 x i64>* undef - ret <4 x i64> %T -} - -define<4 x i64> @func_32_64() { - %F = load <4 x i32>* undef - %G = zext <4 x i32> %F to <4 x i64> - %H = load <4 x i32>* undef - %Y = zext <4 x i32> %H to <4 x i64> - %T = or <4 x i64> %Y, %G - ret <4 x i64> %T -} - -define<4 x i16> @func_8_16() { - %F = load <4 x i8>* undef - %G = zext <4 x i8> %F to <4 x i16> - %H = load <4 x i8>* undef - %Y = zext <4 x i8> %H to <4 x i16> - %T = add <4 x i16> %Y, %G - ret <4 x i16> %T -} - -define<4 x i32> @func_8_32() { - %F = load <4 x i8>* undef - %G = zext <4 x i8> %F to <4 x i32> - %H = load <4 x i8>* undef - %Y = zext <4 x i8> %H to <4 x i32> - %T = sub <4 x i32> %Y, %G - ret <4 x i32> %T -} - -define<4 x i64> @func_8_64() { - %F = load <4 x i8>* undef - %G = zext <4 x i8> %F to <4 x i64> - %H = load <4 x i8>* undef - %Y = zext <4 x i8> %H to <4 x i64> - %T = add <4 x i64> %Y, %G - ret <4 x i64> %T -} - -define<4 x i32> @const_16_32() { - %G = zext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i32> - ret <4 x i32> %G -} - -define<4 x i64> @const_16_64() { - %G = zext <4 x i16> <i16 0, i16 3, i16 8, i16 7> to <4 x i64> - ret <4 x i64> %G -} - diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll new file mode 100644 index 000000000000..f23b82883858 --- /dev/null +++ b/test/CodeGen/X86/vector-blend.ll @@ -0,0 +1,801 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 + +; AVX128 tests: + +define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) { +; SSE2-LABEL: vsel_float: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_float: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_float: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: vsel_float: +; AVX: # BB#0: # %entry 
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX-NEXT: retq +entry: + %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2 + ret <4 x float> %vsel +} + +define <4 x float> @vsel_float2(<4 x float> %v1, <4 x float> %v2) { +; SSE2-LABEL: vsel_float2: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_float2: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_float2: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: vsel_float2: +; AVX: # BB#0: # %entry +; AVX-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: retq +entry: + %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2 + ret <4 x float> %vsel +} + +define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { +; SSE2-LABEL: vsel_4xi8: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_4xi8: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_4xi8: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: vsel_4xi8: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_4xi8: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] +; AVX2-NEXT: retq +entry: + %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2 + ret <4 x i8> %vsel +} + +define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) { +; SSE2-LABEL: vsel_4xi16: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_4xi16: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_4xi16: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: vsel_4xi16: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_4xi16: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] +; AVX2-NEXT: retq +entry: + %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2 + ret <4 x i16> %vsel +} + +define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) { +; SSE2-LABEL: vsel_i32: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_i32: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_i32: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: vsel_i32: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_i32: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq +entry: + %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2 + ret <4 x i32> %vsel +} + +define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) { +; SSE2-LABEL: vsel_double: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_double: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_double: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: vsel_double: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; AVX-NEXT: retq +entry: + %vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2 + ret <2 x double> %vsel +} + +define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) { +; SSE2-LABEL: vsel_i64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_i64: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_i64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: vsel_i64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_i64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq +entry: + %vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2 + ret <2 x i64> %vsel +} + +define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) { +; SSE2-LABEL: vsel_8xi16: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_8xi16: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_8xi16: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: vsel_8xi16: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: retq +entry: + %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2 + ret <8 x i16> %vsel +} + +define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) { +; SSE2-LABEL: vsel_i8: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: andps {{.*}}(%rip), %xmm1 +; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_i8: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1 +; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_i8: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: movaps 
{{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE41-NEXT: pblendvb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: vsel_i8: +; AVX: # BB#0: # %entry +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +entry: + %vsel = select <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <16 x i8> %v1, <16 x i8> %v2 + ret <16 x i8> %vsel +} + + +; AVX256 tests: + +define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) { +; SSE2-LABEL: vsel_float8: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movss %xmm0, %xmm2 +; SSE2-NEXT: movss %xmm1, %xmm3 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_float8: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movss %xmm0, %xmm2 +; SSSE3-NEXT: movss %xmm1, %xmm3 +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: movaps %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_float8: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: vsel_float8: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX-NEXT: retq +entry: + %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2 + ret <8 x float> %vsel +} + +define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) { +; SSE2-LABEL: vsel_i328: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movss %xmm0, %xmm2 +; SSE2-NEXT: movss %xmm1, %xmm3 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_i328: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movss %xmm0, %xmm2 +; SSSE3-NEXT: movss %xmm1, %xmm3 +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: movaps %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_i328: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: vsel_i328: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_i328: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT: retq +entry: + %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2 + ret <8 x i32> %vsel +} + +define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) { +; SSE2-LABEL: vsel_double8: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsd %xmm0, %xmm4 +; SSE2-NEXT: movsd %xmm2, %xmm6 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm5, %xmm1 +; SSE2-NEXT: movaps %xmm6, %xmm2 +; SSE2-NEXT: movaps %xmm7, %xmm3 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_double8: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsd %xmm0, %xmm4 +; SSSE3-NEXT: movsd %xmm2, %xmm6 +; SSSE3-NEXT: movaps %xmm4, %xmm0 +; SSSE3-NEXT: movaps %xmm5, %xmm1 +; SSSE3-NEXT: movaps %xmm6, %xmm2 +; SSSE3-NEXT: movaps %xmm7, %xmm3 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_double8: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1] 
+; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1] +; SSE41-NEXT: movaps %xmm5, %xmm1 +; SSE41-NEXT: movaps %xmm7, %xmm3 +; SSE41-NEXT: retq +; +; AVX-LABEL: vsel_double8: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3] +; AVX-NEXT: retq +entry: + %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2 + ret <8 x double> %vsel +} + +define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) { +; SSE2-LABEL: vsel_i648: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsd %xmm0, %xmm4 +; SSE2-NEXT: movsd %xmm2, %xmm6 +; SSE2-NEXT: movaps %xmm4, %xmm0 +; SSE2-NEXT: movaps %xmm5, %xmm1 +; SSE2-NEXT: movaps %xmm6, %xmm2 +; SSE2-NEXT: movaps %xmm7, %xmm3 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_i648: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsd %xmm0, %xmm4 +; SSSE3-NEXT: movsd %xmm2, %xmm6 +; SSSE3-NEXT: movaps %xmm4, %xmm0 +; SSSE3-NEXT: movaps %xmm5, %xmm1 +; SSSE3-NEXT: movaps %xmm6, %xmm2 +; SSSE3-NEXT: movaps %xmm7, %xmm3 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_i648: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] +; SSE41-NEXT: movaps %xmm5, %xmm1 +; SSE41-NEXT: movaps %xmm7, %xmm3 +; SSE41-NEXT: retq +; +; AVX1-LABEL: vsel_i648: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3] +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_i648: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7] +; AVX2-NEXT: retq +entry: + %vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2 + ret <8 x i64> %vsel +} + +define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) { +; SSE2-LABEL: vsel_double4: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsd %xmm0, %xmm2 +; SSE2-NEXT: movsd %xmm1, %xmm3 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: vsel_double4: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsd %xmm0, %xmm2 +; SSSE3-NEXT: movsd %xmm1, %xmm3 +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: movaps %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: vsel_double4: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: vsel_double4: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX-NEXT: retq +entry: + %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2 + ret <4 x double> %vsel +} + +define <2 x double> @testa(<2 x double> %x, <2 x double> %y) { +; SSE2-LABEL: testa: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: cmplepd %xmm0, %xmm2 +; SSE2-NEXT: andpd %xmm2, %xmm0 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: testa: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movapd %xmm1, %xmm2 +; SSSE3-NEXT: cmplepd %xmm0, %xmm2 +; SSSE3-NEXT: andpd %xmm2, %xmm0 +; SSSE3-NEXT: andnpd %xmm1, %xmm2 +; SSSE3-NEXT: orpd %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testa: +; SSE41: # BB#0: # %entry 
+; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmplepd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testa: +; AVX: # BB#0: # %entry +; AVX-NEXT: vcmplepd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +entry: + %max_is_x = fcmp oge <2 x double> %x, %y + %max = select <2 x i1> %max_is_x, <2 x double> %x, <2 x double> %y + ret <2 x double> %max +} + +define <2 x double> @testb(<2 x double> %x, <2 x double> %y) { +; SSE2-LABEL: testb: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movapd %xmm1, %xmm2 +; SSE2-NEXT: cmpnlepd %xmm0, %xmm2 +; SSE2-NEXT: andpd %xmm2, %xmm0 +; SSE2-NEXT: andnpd %xmm1, %xmm2 +; SSE2-NEXT: orpd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: testb: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movapd %xmm1, %xmm2 +; SSSE3-NEXT: cmpnlepd %xmm0, %xmm2 +; SSSE3-NEXT: andpd %xmm2, %xmm0 +; SSSE3-NEXT: andnpd %xmm1, %xmm2 +; SSSE3-NEXT: orpd %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testb: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movapd %xmm0, %xmm2 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: cmpnlepd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testb: +; AVX: # BB#0: # %entry +; AVX-NEXT: vcmpnlepd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +entry: + %min_is_x = fcmp ult <2 x double> %x, %y + %min = select <2 x i1> %min_is_x, <2 x double> %x, <2 x double> %y + ret <2 x double> %min +} + +; If we can figure out a blend has a constant mask, we should emit the +; blend instruction with an immediate mask +define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) { +; SSE2-LABEL: constant_blendvpd_avx: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsd %xmm1, %xmm3 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: constant_blendvpd_avx: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsd %xmm1, %xmm3 +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: movaps %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: constant_blendvpd_avx: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1] +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: constant_blendvpd_avx: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3] +; AVX-NEXT: retq +entry: + %select = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab + ret <4 x double> %select +} + +define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) { +; SSE2-LABEL: constant_blendvps_avx: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0] +; SSE2-NEXT: andps %xmm4, %xmm2 +; SSE2-NEXT: movaps {{.*#+}} xmm5 = [0,0,0,4294967295] +; SSE2-NEXT: andps %xmm5, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm4, %xmm3 +; SSE2-NEXT: andps %xmm5, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: constant_blendvps_avx: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movaps {{.*#+}} xmm4 = [4294967295,4294967295,4294967295,0] +; SSSE3-NEXT: andps %xmm4, %xmm2 +; SSSE3-NEXT: movaps {{.*#+}} xmm5 = [0,0,0,4294967295] +; SSSE3-NEXT: andps %xmm5, %xmm0 +; SSSE3-NEXT: orps %xmm2, %xmm0 +; SSSE3-NEXT: andps %xmm4, %xmm3 +; SSSE3-NEXT: andps %xmm5, %xmm1 +; SSSE3-NEXT: orps %xmm3, %xmm1 
+; SSSE3-NEXT: retq +; +; SSE41-LABEL: constant_blendvps_avx: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: constant_blendvps_avx: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] +; AVX-NEXT: retq +entry: + %select = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd + ret <8 x float> %select +} + +define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) { +; SSE2-LABEL: constant_pblendvb_avx2: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; SSE2-NEXT: andps %xmm4, %xmm2 +; SSE2-NEXT: movaps {{.*#+}} xmm5 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; SSE2-NEXT: andps %xmm5, %xmm0 +; SSE2-NEXT: orps %xmm2, %xmm0 +; SSE2-NEXT: andps %xmm4, %xmm3 +; SSE2-NEXT: andps %xmm5, %xmm1 +; SSE2-NEXT: orps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: constant_pblendvb_avx2: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; SSSE3-NEXT: andps %xmm4, %xmm2 +; SSSE3-NEXT: movaps {{.*#+}} xmm5 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; SSSE3-NEXT: andps %xmm5, %xmm0 +; SSSE3-NEXT: orps %xmm2, %xmm0 +; SSSE3-NEXT: andps %xmm4, %xmm3 +; SSSE3-NEXT: andps %xmm5, %xmm1 +; SSSE3-NEXT: orps %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: constant_pblendvb_avx2: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; SSE41-NEXT: pblendvb %xmm4, %xmm2 +; SSE41-NEXT: pblendvb %xmm1, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm3, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: constant_pblendvb_avx2: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_pblendvb_avx2: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq +entry: + %select = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd + ret <32 x i8> %select +} + +declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) +declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) + +;; 4 tests for shufflevectors that optimize to blend + immediate +define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: blend_shufflevector_4xfloat: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: blend_shufflevector_4xfloat: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; 
SSSE3-NEXT: retq +; +; SSE41-LABEL: blend_shufflevector_4xfloat: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: blend_shufflevector_4xfloat: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX-NEXT: retq +entry: + %select = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x float> %select +} + +define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) { +; SSE2-LABEL: blend_shufflevector_8xfloat: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movss %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: blend_shufflevector_8xfloat: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movss %xmm0, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: movaps %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: blend_shufflevector_8xfloat: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: blend_shufflevector_8xfloat: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] +; AVX-NEXT: retq +entry: + %select = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 6, i32 15> + ret <8 x float> %select +} + +define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) { +; SSE2-LABEL: blend_shufflevector_4xdouble: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsd %xmm0, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: blend_shufflevector_4xdouble: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsd %xmm0, %xmm2 +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: blend_shufflevector_4xdouble: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: blend_shufflevector_4xdouble: +; AVX: # BB#0: # %entry +; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX-NEXT: retq +entry: + %select = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + ret <4 x double> %select +} + +define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) { +; SSE2-LABEL: blend_shufflevector_4xi64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsd %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm3, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: blend_shufflevector_4xi64: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsd %xmm2, %xmm0 +; SSSE3-NEXT: movaps %xmm3, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: blend_shufflevector_4xi64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: movaps %xmm3, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: blend_shufflevector_4xi64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: blend_shufflevector_4xi64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] +; AVX2-NEXT: retq +entry: + %select = shufflevector <4 x i64> %a, <4 x i64> %b, 
<4 x i32> <i32 4, i32 1, i32 6, i32 7>
+  ret <4 x i64> %select
+}
diff --git a/test/CodeGen/X86/vector-ctpop.ll b/test/CodeGen/X86/vector-ctpop.ll
new file mode 100644
index 000000000000..7091927a9006
--- /dev/null
+++ b/test/CodeGen/X86/vector-ctpop.ll
@@ -0,0 +1,159 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck -check-prefix=AVX2 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=-popcnt | FileCheck -check-prefix=AVX1-NOPOPCNT %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=-popcnt | FileCheck -check-prefix=AVX2-NOPOPCNT %s
+
+; Vector version of:
+; v = v - ((v >> 1) & 0x55555555)
+; v = (v & 0x33333333) + ((v >> 2) & 0x33333333)
+; v = (v + (v >> 4)) & 0xF0F0F0F
+; v = v + (v >> 8)
+; v = v + (v >> 16)
+; v = v + (v >> 32) ; i64 only
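+;
+; For reference, the same recipe as a scalar C routine for one 32-bit lane
+; (an illustrative sketch only, not part of the checked IR; the helper name
+; is ours, and the closing vpbroadcastd/vpand pair in the checks below
+; presumably applies the final mask shown here):
+;
+;   unsigned popcount32(unsigned v) {
+;     v = v - ((v >> 1) & 0x55555555);                /* 2-bit partial sums */
+;     v = (v & 0x33333333) + ((v >> 2) & 0x33333333); /* 4-bit partial sums */
+;     v = (v + (v >> 4)) & 0x0F0F0F0F;                /* per-byte counts */
+;     v = v + (v >> 8);                               /* fold into low byte */
+;     v = v + (v >> 16);
+;     return v & 0x3F;                                /* a count is at most 32 */
+;   }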
+
+define <8 x i32> @test0(<8 x i32> %x) {
+; AVX2-LABEL: @test0
+entry:
+; AVX2: vpsrld $1, %ymm
+; AVX2-NEXT: vpbroadcastd
+; AVX2-NEXT: vpand
+; AVX2-NEXT: vpsubd
+; AVX2-NEXT: vpbroadcastd
+; AVX2-NEXT: vpand
+; AVX2-NEXT: vpsrld $2
+; AVX2-NEXT: vpand
+; AVX2-NEXT: vpaddd
+; AVX2-NEXT: vpsrld $4
+; AVX2-NEXT: vpaddd
+; AVX2-NEXT: vpbroadcastd
+; AVX2-NEXT: vpand
+; AVX2-NEXT: vpsrld $8
+; AVX2-NEXT: vpaddd
+; AVX2-NEXT: vpsrld $16
+; AVX2-NEXT: vpaddd
+; AVX2-NEXT: vpbroadcastd
+; AVX2-NEXT: vpand
+  %y = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %x)
+  ret <8 x i32> %y
+}
+
+define <4 x i64> @test1(<4 x i64> %x) {
+; AVX2-NOPOPCNT-LABEL: @test1
+entry:
+; AVX2-NOPOPCNT: vpsrlq $1, %ymm
+; AVX2-NOPOPCNT-NEXT: vpbroadcastq
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsubq
+; AVX2-NOPOPCNT-NEXT: vpbroadcastq
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsrlq $2
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpsrlq $4
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpbroadcastq
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsrlq $8
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpsrlq $16
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpsrlq $32
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpbroadcastq
+; AVX2-NOPOPCNT-NEXT: vpand
+  %y = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %x)
+  ret <4 x i64> %y
+}
+
+define <4 x i32> @test2(<4 x i32> %x) {
+; AVX2-NOPOPCNT-LABEL: @test2
+; AVX1-NOPOPCNT-LABEL: @test2
+entry:
+; AVX2-NOPOPCNT: vpsrld $1, %xmm
+; AVX2-NOPOPCNT-NEXT: vpbroadcastd
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsubd
+; AVX2-NOPOPCNT-NEXT: vpbroadcastd
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsrld $2
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpaddd
+; AVX2-NOPOPCNT-NEXT: vpsrld $4
+; AVX2-NOPOPCNT-NEXT: vpaddd
+; AVX2-NOPOPCNT-NEXT: vpbroadcastd
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsrld $8
+; AVX2-NOPOPCNT-NEXT: vpaddd
+; AVX2-NOPOPCNT-NEXT: vpsrld $16
+; AVX2-NOPOPCNT-NEXT: vpaddd
+; AVX2-NOPOPCNT-NEXT: vpbroadcastd
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT: vpsrld $1, %xmm
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsubd
+; AVX1-NOPOPCNT-NEXT: vmovdqa
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsrld $2
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpaddd
+; AVX1-NOPOPCNT-NEXT: vpsrld $4
+; AVX1-NOPOPCNT-NEXT: vpaddd
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsrld $8
+; AVX1-NOPOPCNT-NEXT: vpaddd
+; AVX1-NOPOPCNT-NEXT: vpsrld $16
+; AVX1-NOPOPCNT-NEXT: vpaddd
+; AVX1-NOPOPCNT-NEXT: vpand
+  %y = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x)
+  ret <4 x i32> %y
+}
+
+define <2 x i64> @test3(<2 x i64> %x) {
+; AVX2-NOPOPCNT-LABEL: @test3
+; AVX1-NOPOPCNT-LABEL: @test3
+entry:
+; AVX2-NOPOPCNT: vpsrlq $1, %xmm
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsubq
+; AVX2-NOPOPCNT-NEXT: vmovdqa
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsrlq $2
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpsrlq $4
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX2-NOPOPCNT-NEXT: vpsrlq $8
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpsrlq $16
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpsrlq $32
+; AVX2-NOPOPCNT-NEXT: vpaddq
+; AVX2-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT: vpsrlq $1, %xmm
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsubq
+; AVX1-NOPOPCNT-NEXT: vmovdqa
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsrlq $2
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpsrlq $4
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpand
+; AVX1-NOPOPCNT-NEXT: vpsrlq $8
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpsrlq $16
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpsrlq $32
+; AVX1-NOPOPCNT-NEXT: vpaddq
+; AVX1-NOPOPCNT-NEXT: vpand
+  %y = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x)
+  ret <2 x i64> %y
+}
+
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
+
+declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
+declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>)
+
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
index a3229073751b..4b269dc923c4 100644
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -1,218 +1,1255 @@
-; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=SSE41
-; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE
-; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX
+; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s --check-prefix=SSE41
+; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s --check-prefix=AVX
-define <4 x i32> @test1(<4 x i32> %a) {
-  %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
-  ret <4 x i32> %div
+target triple = "x86_64-unknown-unknown"
+define <4 x i32> @test1(<4 x i32> %a) {
 ; SSE41-LABEL: test1:
-; SSE41: pmuludq
-; SSE41: pshufd $49
-; SSE41: pmuludq
-; SSE41: shufps $-35
-; SSE41: psubd
-; SSE41: psrld $1
-; SSE41: padd
-; SSE41: psrld $2
-
+; SSE41: # BB#0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pmuludq %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm1, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; SSE41-NEXT: psubd %xmm2, %xmm0
+; SSE41-NEXT: psrld $1, %xmm0
+; SSE41-NEXT: paddd %xmm2, %xmm0
+; SSE41-NEXT: psrld $2, %xmm0
+; SSE41-NEXT: retq
+;
+; SSE-LABEL: test1:
+; SSE: # BB#0:
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm1, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
+; SSE-NEXT: shufps {{.*#+}} xmm2 =
xmm2[0,2,1,3] +; SSE-NEXT: psubd %xmm2, %xmm0 +; SSE-NEXT: psrld $1, %xmm0 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: psrld $2, %xmm0 +; SSE-NEXT: retq +; ; AVX-LABEL: test1: -; AVX: vpmuludq -; AVX: vpshufd $49 -; AVX: vpmuludq -; AVX: vshufps $-35 -; AVX: vpsubd -; AVX: vpsrld $1 -; AVX: vpadd -; AVX: vpsrld $2 +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $2, %xmm0, %xmm0 +; AVX-NEXT: retq + %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7> + ret <4 x i32> %div } define <8 x i32> @test2(<8 x i32> %a) { +; SSE41-LABEL: test2: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm4, %xmm5 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE41-NEXT: psubd %xmm3, %xmm0 +; SSE41-NEXT: psrld $1, %xmm0 +; SSE41-NEXT: paddd %xmm3, %xmm0 +; SSE41-NEXT: psrld $2, %xmm0 +; SSE41-NEXT: pmuludq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm4, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE41-NEXT: psubd %xmm2, %xmm1 +; SSE41-NEXT: psrld $1, %xmm1 +; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: psrld $2, %xmm1 +; SSE41-NEXT: retq +; +; SSE-LABEL: test2: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: psubd %xmm3, %xmm0 +; SSE-NEXT: psrld $1, %xmm0 +; SSE-NEXT: paddd %xmm3, %xmm0 +; SSE-NEXT: psrld $2, %xmm0 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: psubd %xmm2, %xmm1 +; SSE-NEXT: psrld $1, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: psrld $2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: test2: +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsrld $1, %ymm0, %ymm0 +; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsrld $2, %ymm0, %ymm0 +; AVX-NEXT: retq %div = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7> ret <8 x i32> %div 
- -; AVX-LABEL: test2: -; AVX: vpbroadcastd -; AVX: vpalignr $4 -; AVX: vpmuludq -; AVX: vpmuludq -; AVX: vpblendd $170 -; AVX: vpsubd -; AVX: vpsrld $1 -; AVX: vpadd -; AVX: vpsrld $2 } define <8 x i16> @test3(<8 x i16> %a) { - %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> - ret <8 x i16> %div - ; SSE41-LABEL: test3: -; SSE41: pmulhuw -; SSE41: psubw -; SSE41: psrlw $1 -; SSE41: paddw -; SSE41: psrlw $2 - +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363] +; SSE41-NEXT: pmulhuw %xmm0, %xmm1 +; SSE41-NEXT: psubw %xmm1, %xmm0 +; SSE41-NEXT: psrlw $1, %xmm0 +; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: psrlw $2, %xmm0 +; SSE41-NEXT: retq +; +; SSE-LABEL: test3: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363] +; SSE-NEXT: pmulhuw %xmm0, %xmm1 +; SSE-NEXT: psubw %xmm1, %xmm0 +; SSE-NEXT: psrlw $1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: psrlw $2, %xmm0 +; SSE-NEXT: retq +; ; AVX-LABEL: test3: -; AVX: vpmulhuw -; AVX: vpsubw -; AVX: vpsrlw $1 -; AVX: vpaddw -; AVX: vpsrlw $2 +; AVX: # BB#0: +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0 +; AVX-NEXT: retq + %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> + ret <8 x i16> %div } define <16 x i16> @test4(<16 x i16> %a) { +; SSE41-LABEL: test4: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pmulhuw %xmm2, %xmm3 +; SSE41-NEXT: psubw %xmm3, %xmm0 +; SSE41-NEXT: psrlw $1, %xmm0 +; SSE41-NEXT: paddw %xmm3, %xmm0 +; SSE41-NEXT: psrlw $2, %xmm0 +; SSE41-NEXT: pmulhuw %xmm1, %xmm2 +; SSE41-NEXT: psubw %xmm2, %xmm1 +; SSE41-NEXT: psrlw $1, %xmm1 +; SSE41-NEXT: paddw %xmm2, %xmm1 +; SSE41-NEXT: psrlw $2, %xmm1 +; SSE41-NEXT: retq +; +; SSE-LABEL: test4: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pmulhuw %xmm2, %xmm3 +; SSE-NEXT: psubw %xmm3, %xmm0 +; SSE-NEXT: psrlw $1, %xmm0 +; SSE-NEXT: paddw %xmm3, %xmm0 +; SSE-NEXT: psrlw $2, %xmm0 +; SSE-NEXT: pmulhuw %xmm1, %xmm2 +; SSE-NEXT: psubw %xmm2, %xmm1 +; SSE-NEXT: psrlw $1, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: psrlw $2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: test4: +; AVX: # BB#0: +; AVX-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1 +; AVX-NEXT: vpsubw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsrlw $2, %ymm0, %ymm0 +; AVX-NEXT: retq %div = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7> ret <16 x i16> %div - -; AVX-LABEL: test4: -; AVX: vpmulhuw -; AVX: vpsubw -; AVX: vpsrlw $1 -; AVX: vpaddw -; AVX: vpsrlw $2 -; AVX-NOT: vpmulhuw } define <8 x i16> @test5(<8 x i16> %a) { - %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> - ret <8 x i16> %div - ; SSE41-LABEL: test5: -; SSE41: pmulhw -; SSE41: psrlw $15 -; SSE41: psraw $1 -; SSE41: paddw - +; SSE41: # BB#0: +; SSE41-NEXT: pmulhw {{.*}}(%rip), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlw $15, %xmm1 +; SSE41-NEXT: psraw $1, %xmm0 +; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; SSE-LABEL: test5: +; SSE: # BB#0: +; 
SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrlw $15, %xmm1 +; SSE-NEXT: psraw $1, %xmm0 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: retq +; ; AVX-LABEL: test5: -; AVX: vpmulhw -; AVX: vpsrlw $15 -; AVX: vpsraw $1 -; AVX: vpaddw +; AVX: # BB#0: +; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1 +; AVX-NEXT: vpsraw $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> + ret <8 x i16> %div } define <16 x i16> @test6(<16 x i16> %a) { +; SSE41-LABEL: test6: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725] +; SSE41-NEXT: pmulhw %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrlw $15, %xmm3 +; SSE41-NEXT: psraw $1, %xmm0 +; SSE41-NEXT: paddw %xmm3, %xmm0 +; SSE41-NEXT: pmulhw %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrlw $15, %xmm2 +; SSE41-NEXT: psraw $1, %xmm1 +; SSE41-NEXT: paddw %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; SSE-LABEL: test6: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725] +; SSE-NEXT: pmulhw %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrlw $15, %xmm3 +; SSE-NEXT: psraw $1, %xmm0 +; SSE-NEXT: paddw %xmm3, %xmm0 +; SSE-NEXT: pmulhw %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrlw $15, %xmm2 +; SSE-NEXT: psraw $1, %xmm1 +; SSE-NEXT: paddw %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: test6: +; AVX: # BB#0: +; AVX-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vpsrlw $15, %ymm0, %ymm1 +; AVX-NEXT: vpsraw $1, %ymm0, %ymm0 +; AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq %div = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7> ret <16 x i16> %div - -; AVX-LABEL: test6: -; AVX: vpmulhw -; AVX: vpsrlw $15 -; AVX: vpsraw $1 -; AVX: vpaddw -; AVX-NOT: vpmulhw } define <16 x i8> @test7(<16 x i8> %a) { - %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7> - ret <16 x i8> %div - -; FIXME: scalarized ; SSE41-LABEL: test7: -; SSE41: pext +; SSE41: # BB#0: +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pextrb $0, %xmm0, %ecx +; SSE41-NEXT: movsbl %cl, %ecx +; SSE41-NEXT: imull $-109, %ecx, %edx +; SSE41-NEXT: shrl $8, %edx +; SSE41-NEXT: addb %dl, %cl +; SSE41-NEXT: movb %cl, %dl +; SSE41-NEXT: shrb $7, %dl +; SSE41-NEXT: sarb $2, %cl +; SSE41-NEXT: addb %dl, %cl +; SSE41-NEXT: movzbl %cl, %ecx +; SSE41-NEXT: movd %ecx, %xmm1 +; SSE41-NEXT: pinsrb $1, %eax, %xmm1 +; SSE41-NEXT: pextrb $2, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm1 +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl 
+; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm1 +; SSE41-NEXT: pextrb $4, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm1 +; SSE41-NEXT: pextrb $5, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm1 +; SSE41-NEXT: pextrb $6, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm1 +; SSE41-NEXT: pextrb $7, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm1 +; SSE41-NEXT: pextrb $8, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm1 +; SSE41-NEXT: pextrb $9, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm1 +; SSE41-NEXT: pextrb $10, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm1 +; SSE41-NEXT: pextrb $11, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $11, %eax, %xmm1 +; SSE41-NEXT: pextrb $12, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm1 +; SSE41-NEXT: pextrb $13, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; 
SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $13, %eax, %xmm1 +; SSE41-NEXT: pextrb $14, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm1 +; SSE41-NEXT: pextrb $15, %xmm0, %eax +; SSE41-NEXT: movsbl %al, %eax +; SSE41-NEXT: imull $-109, %eax, %ecx +; SSE41-NEXT: shrl $8, %ecx +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movb %al, %cl +; SSE41-NEXT: shrb $7, %cl +; SSE41-NEXT: sarb $2, %al +; SSE41-NEXT: addb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; SSE-LABEL: test7: +; SSE: # BB#0: +; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx 
+; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm3 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm3 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; 
SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm4 +; SSE-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: imull $-109, %eax, %ecx +; SSE-NEXT: shrl $8, %ecx +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movb %cl, %al +; SSE-NEXT: shrb $7, %al +; SSE-NEXT: sarb $2, %cl +; SSE-NEXT: addb %al, %cl +; SSE-NEXT: movzbl %cl, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: retq +; ; AVX-LABEL: test7: -; AVX: pext +; AVX: # BB#0: +; AVX-NEXT: vpextrb $1, %xmm0, %eax +; AVX-NEXT: movsbl %al, %eax +; AVX-NEXT: imull $-109, %eax, %ecx +; AVX-NEXT: shrl $8, %ecx +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: movb %al, %cl +; AVX-NEXT: shrb $7, %cl +; AVX-NEXT: sarb $2, %al +; AVX-NEXT: addb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpextrb $0, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %dl +; AVX-NEXT: shrb $7, %dl +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movzbl %cl, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $3, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $4, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $5, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $6, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl 
%cl, %eax +; AVX-NEXT: vpextrb $7, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $8, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $9, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $10, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $11, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $12, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $13, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $14, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpextrb $15, %xmm0, %ecx +; AVX-NEXT: movsbl %cl, %ecx +; AVX-NEXT: imull $-109, %ecx, %edx +; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm0 +; AVX-NEXT: shrl $8, %edx +; AVX-NEXT: addb %dl, %cl +; AVX-NEXT: movb %cl, %al +; AVX-NEXT: shrb $7, %al +; AVX-NEXT: sarb $2, %cl +; AVX-NEXT: addb %al, %cl +; AVX-NEXT: movzbl %cl, %eax +; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq + %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7> + ret <16 x i8> %div } define <4 x i32> @test8(<4 x i32> %a) { - %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7> - ret <4 x i32> %div - ; SSE41-LABEL: test8: -; SSE41: pmuldq -; SSE41: pshufd $49 -; SSE41-NOT: pshufd $49 -; SSE41: pmuldq -; SSE41: shufps $-35 -; SSE41: pshufd $-40 -; SSE41: padd 
-; SSE41: psrld $31 -; SSE41: psrad $2 -; SSE41: padd - +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pmuldq %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm2, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $31, %xmm0 +; SSE41-NEXT: psrad $2, %xmm1 +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; ; SSE-LABEL: test8: -; SSE: pmuludq -; SSE: pshufd $49 -; SSE-NOT: pshufd $49 -; SSE: pmuludq -; SSE: shufps $-35 -; SSE: pshufd $-40 -; SSE: psubd -; SSE: padd -; SSE: psrld $31 -; SSE: psrad $2 -; SSE: padd - +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: paddd %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: psubd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrld $31, %xmm0 +; SSE-NEXT: psrad $2, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; ; AVX-LABEL: test8: -; AVX: vpmuldq -; AVX: vpshufd $49 -; AVX-NOT: vpshufd $49 -; AVX: vpmuldq -; AVX: vshufps $-35 -; AVX: vpshufd $-40 -; AVX: vpadd -; AVX: vpsrld $31 -; AVX: vpsrad $2 -; AVX: vpadd +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-NEXT: vpmuldq %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsrld $31, %xmm0, %xmm1 +; AVX-NEXT: vpsrad $2, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq + %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7> + ret <4 x i32> %div } define <8 x i32> @test9(<8 x i32> %a) { +; SSE41-LABEL: test9: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; SSE41-NEXT: # kill: XMM0<def> XMM3<kill> +; SSE41-NEXT: pmuldq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm4, %xmm5 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm5[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE41-NEXT: paddd %xmm3, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: psrld $31, %xmm3 +; SSE41-NEXT: psrad $2, %xmm0 +; SSE41-NEXT: paddd %xmm3, %xmm0 +; SSE41-NEXT: pmuldq %xmm2, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm4, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; 
SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrld $31, %xmm2 +; SSE41-NEXT: psrad $2, %xmm1 +; SSE41-NEXT: paddd %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; SSE-LABEL: test9: +; SSE: # BB#0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psrad $31, %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: psrad $31, %xmm5 +; SSE-NEXT: pand %xmm1, %xmm5 +; SSE-NEXT: paddd %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] +; SSE-NEXT: pmuludq %xmm6, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm7[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: psubd %xmm5, %xmm0 +; SSE-NEXT: paddd %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrld $31, %xmm3 +; SSE-NEXT: psrad $2, %xmm0 +; SSE-NEXT: paddd %xmm3, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: paddd %xmm4, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: pmuludq %xmm6, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: psubd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrld $31, %xmm2 +; SSE-NEXT: psrad $2, %xmm1 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: test9: +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vpsrld $31, %ymm0, %ymm1 +; AVX-NEXT: vpsrad $2, %ymm0, %ymm0 +; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq %div = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7> ret <8 x i32> %div - -; AVX-LABEL: test9: -; AVX: vpalignr $4 -; AVX: vpbroadcastd -; AVX: vpmuldq -; AVX: vpmuldq -; AVX: vpblendd $170 -; AVX: vpadd -; AVX: vpsrld $31 -; AVX: vpsrad $2 -; AVX: vpadd } define <8 x i32> @test10(<8 x i32> %a) { +; SSE41-LABEL: test10: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pmuludq %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm4, %xmm5 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psubd %xmm3, %xmm5 +; SSE41-NEXT: psrld $1, %xmm5 +; SSE41-NEXT: paddd %xmm3, %xmm5 +; SSE41-NEXT: psrld $2, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7] +; SSE41-NEXT: pmulld %xmm3, %xmm5 +; SSE41-NEXT: psubd %xmm5, %xmm0 +; SSE41-NEXT: pmuludq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuludq %xmm4, %xmm5 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[1,3],xmm5[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: psubd %xmm2, %xmm4 +; SSE41-NEXT: psrld $1, %xmm4 +; SSE41-NEXT: paddd %xmm2, %xmm4 +; SSE41-NEXT: psrld $2, %xmm4 +; SSE41-NEXT: pmulld %xmm3, %xmm4 +; SSE41-NEXT: psubd %xmm4, %xmm1 +; SSE41-NEXT: retq +; +; SSE-LABEL: test10: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: psubd %xmm3, %xmm5 +; SSE-NEXT: psrld $1, %xmm5 +; SSE-NEXT: paddd %xmm3, %xmm5 +; SSE-NEXT: psrld $2, %xmm5 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE-NEXT: pmuludq %xmm3, %xmm5 +; SSE-NEXT: pmuludq %xmm3, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: psubd %xmm5, %xmm0 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm5[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: psubd %xmm2, %xmm4 +; SSE-NEXT: psrld $1, %xmm4 +; SSE-NEXT: paddd %xmm2, %xmm4 +; SSE-NEXT: psrld $2, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: pmuludq %xmm3, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3] +; SSE-NEXT: psubd %xmm4, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: test10: +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm2 +; AVX-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vpsrld $2, %ymm1, %ymm1 +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq %rem = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7> ret <8 x i32> %rem - -; AVX-LABEL: test10: -; AVX: vpbroadcastd -; AVX: vpalignr $4 -; AVX: vpmuludq -; AVX: vpmuludq -; AVX: vpblendd $170 -; AVX: vpsubd -; AVX: vpsrld $1 -; AVX: vpadd -; AVX: vpsrld $2 -; AVX: vpmulld } define <8 x i32> @test11(<8 x i32> %a) { +; SSE41-LABEL: test11: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pmuldq %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm4, %xmm5 +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm5[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE41-NEXT: paddd %xmm0, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: psrld $31, %xmm5 +; SSE41-NEXT: psrad $2, %xmm3 +; SSE41-NEXT: paddd 
%xmm5, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [7,7,7,7] +; SSE41-NEXT: pmulld %xmm5, %xmm3 +; SSE41-NEXT: psubd %xmm3, %xmm0 +; SSE41-NEXT: pmuldq %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm4, %xmm3 +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE41-NEXT: paddd %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrld $31, %xmm3 +; SSE41-NEXT: psrad $2, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 +; SSE41-NEXT: pmulld %xmm5, %xmm2 +; SSE41-NEXT: psubd %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; SSE-LABEL: test11: +; SSE: # BB#0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: psrad $31, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: paddd %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm7 +; SSE-NEXT: pmuludq %xmm2, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,3],xmm4[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: psubd %xmm6, %xmm7 +; SSE-NEXT: paddd %xmm0, %xmm7 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: psrld $31, %xmm4 +; SSE-NEXT: psrad $2, %xmm7 +; SSE-NEXT: paddd %xmm4, %xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [7,7,7,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm7 +; SSE-NEXT: pmuludq %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: psubd %xmm7, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm6 +; SSE-NEXT: psrad $31, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: paddd %xmm3, %xmm6 +; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm5, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: psubd %xmm6, %xmm2 +; SSE-NEXT: paddd %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: psrld $31, %xmm3 +; SSE-NEXT: psrad $2, %xmm2 +; SSE-NEXT: paddd %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: psubd %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: test11: +; AVX: # BB#0: +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpmuldq %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vpmuldq %ymm1, %ymm0, %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] +; AVX-NEXT: vpaddd %ymm0, %ymm1, %ymm1 +; AVX-NEXT: vpsrld $31, %ymm1, %ymm2 +; AVX-NEXT: vpsrad $2, %ymm1, %ymm1 +; AVX-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2 +; AVX-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq %rem = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7> ret <8 x i32> %rem - -; AVX-LABEL: test11: -; AVX: vpalignr $4 -; AVX: vpbroadcastd -; AVX: 
vpmuldq -; AVX: vpmuldq -; AVX: vpblendd $170 -; AVX: vpadd -; AVX: vpsrld $31 -; AVX: vpsrad $2 -; AVX: vpadd -; AVX: vpmulld } define <2 x i16> @test12() { +; SSE41-LABEL: test12: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; SSE-LABEL: test12: +; SSE: # BB#0: +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test12: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq %I8 = insertelement <2 x i16> zeroinitializer, i16 -1, i32 0 %I9 = insertelement <2 x i16> %I8, i16 -1, i32 1 %B9 = urem <2 x i16> %I9, %I9 ret <2 x i16> %B9 +} -; AVX-LABEL: test12: -; AVX: xorps +define <4 x i32> @PR20355(<4 x i32> %a) { +; SSE41-LABEL: PR20355: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE41-NEXT: pmuldq %xmm2, %xmm1 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: psrld $31, %xmm1 +; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; SSE-LABEL: PR20355: +; SSE: # BB#0: # %entry +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766] +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrad $31, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: paddd %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE-NEXT: pmuludq %xmm2, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: psubd %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $31, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: PR20355: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,3],xmm0[1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vpsrld $31, %xmm0, %xmm1 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +entry: + %sdiv = sdiv <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3> + ret <4 x i32> %sdiv } diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll new file mode 100644 index 000000000000..758833155a96 --- /dev/null +++ b/test/CodeGen/X86/vector-sext.ll @@ -0,0 +1,942 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; +; Just one 32-bit run to make sure we do reasonable things 
there. +; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=i686 -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE41 + +define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_8i16_to_8i32: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: # kill: XMM0<def> XMM1<kill> +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_8i16_to_8i32: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: # kill: XMM0<def> XMM1<kill> +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $16, %xmm0 +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $16, %xmm1 +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_8i16_to_8i32: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pmovzxwd %xmm1, %xmm0 +; SSE41-NEXT: pslld $16, %xmm0 +; SSE41-NEXT: psrad $16, %xmm0 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pslld $16, %xmm1 +; SSE41-NEXT: psrad $16, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_8i16_to_8i32: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_8i16_to_8i32: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; X32-SSE41-LABEL: sext_8i16_to_8i32: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movdqa %xmm0, %xmm1 +; X32-SSE41-NEXT: pmovzxwd %xmm1, %xmm0 +; X32-SSE41-NEXT: pslld $16, %xmm0 +; X32-SSE41-NEXT: psrad $16, %xmm0 +; X32-SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X32-SSE41-NEXT: pslld $16, %xmm1 +; X32-SSE41-NEXT: psrad $16, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %B = sext <8 x i16> %A to <8 x i32> + ret <8 x i32>%B +} + +define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: sext_4i32_to_4i64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i32_to_4i64: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SSSE3-NEXT: movd %xmm1, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm1, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: cltq +; 
SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i32_to_4i64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovzxdq %xmm0, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm3 +; SSE41-NEXT: movd %xmm1, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm3 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_4i32_to_4i64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_4i32_to_4i64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; X32-SSE41-LABEL: sext_4i32_to_4i64: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2 +; X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; X32-SSE41-NEXT: movd %xmm1, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl +entry: + %B = sext <4 x i32> %A to <4 x i64> + ret <4 x i64>%B +} + +define <4 x i32> @load_sext_test1(<4 x i16> *%ptr) { +; SSE2-LABEL: load_sext_test1: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movq (%rdi), %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_test1: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movq (%rdi), %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $16, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_test1: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_test1: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_test1: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i16>* %ptr + %Y = sext <4 x i16> %X to <4 x i32> + ret <4 x i32>%Y +} + +define <4 x i32> @load_sext_test2(<4 x i8> *%ptr) { +; SSE2-LABEL: load_sext_test2: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movd (%rdi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_test2: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_test2: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_test2: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_test2: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i8>* %ptr + %Y = sext <4 x i8> %X to <4 x i32> + ret <4 x i32>%Y +} + +define <2 x i64> @load_sext_test3(<2 x i8> *%ptr) { +; SSE2-LABEL: load_sext_test3: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movsbq 1(%rdi), %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: movsbq (%rdi), %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_test3: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movsbq 1(%rdi), %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: movsbq (%rdi), %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_test3: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_test3: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_test3: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <2 x i8>* %ptr + %Y = sext <2 x i8> %X to <2 x i64> + ret <2 x i64>%Y +} + +define <2 x i64> @load_sext_test4(<2 x i16> *%ptr) { +; SSE2-LABEL: load_sext_test4: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movswq 2(%rdi), %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: movswq (%rdi), %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_test4: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movswq 2(%rdi), %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: movswq (%rdi), %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_test4: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovsxwq (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_test4: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_test4: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <2 x i16>* %ptr + %Y = sext <2 x i16> %X to <2 x i64> + ret <2 x i64>%Y +} + +define <2 x i64> @load_sext_test5(<2 x i32> *%ptr) { +; SSE2-LABEL: load_sext_test5: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movslq 4(%rdi), %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: movslq (%rdi), %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_test5: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movslq 4(%rdi), %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: movslq (%rdi), %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_test5: +; SSE41: # BB#0: # %entry +; 
SSE41-NEXT: pmovsxdq (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_test5: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovsxdq (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_test5: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <2 x i32>* %ptr + %Y = sext <2 x i32> %X to <2 x i64> + ret <2 x i64>%Y +} + +define <8 x i16> @load_sext_test6(<8 x i8> *%ptr) { +; SSE2-LABEL: load_sext_test6: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movq (%rdi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_test6: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movq (%rdi), %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_test6: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovsxbw (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: load_sext_test6: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 +; AVX-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_test6: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0 +; X32-SSE41-NEXT: retl +entry: + %X = load <8 x i8>* %ptr + %Y = sext <8 x i8> %X to <8 x i16> + ret <8 x i16>%Y +} + +define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { +; SSE2-LABEL: sext_4i1_to_4i64: +; SSE2: # BB#0: +; SSE2-NEXT: pslld $31, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i1_to_4i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: pslld $31, %xmm0 +; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SSSE3-NEXT: movd %xmm1, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm1, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i1_to_4i64: +; SSE41: # BB#0: +; SSE41-NEXT: pslld $31, %xmm0 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovzxdq %xmm0, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm3 +; SSE41-NEXT: movd %xmm1, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; 
SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm3 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_4i1_to_4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_4i1_to_4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; X32-SSE41-LABEL: sext_4i1_to_4i64: +; X32-SSE41: # BB#0: +; X32-SSE41-NEXT: pslld $31, %xmm0 +; X32-SSE41-NEXT: psrad $31, %xmm0 +; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2 +; X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; X32-SSE41-NEXT: movd %xmm1, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl + %extmask = sext <4 x i1> %mask to <4 x i64> + ret <4 x i64> %extmask +} + +define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) { +; SSE2-LABEL: sext_16i8_to_16i16: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: psllw $8, %xmm1 +; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_16i8_to_16i16: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: psllw $8, %xmm0 +; SSSE3-NEXT: psraw $8, %xmm0 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: psllw $8, %xmm1 +; SSSE3-NEXT: psraw $8, %xmm1 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_16i8_to_16i16: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbw %xmm1, %xmm0 +; SSE41-NEXT: psllw $8, %xmm0 +; SSE41-NEXT: psraw $8, %xmm0 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: psllw $8, %xmm1 +; SSE41-NEXT: psraw $8, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_16i8_to_16i16: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_16i8_to_16i16: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovsxbw (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; X32-SSE41-LABEL: sext_16i8_to_16i16: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X32-SSE41-NEXT: movdqa (%eax), %xmm1 +; X32-SSE41-NEXT: pmovzxbw %xmm1, %xmm0 +; X32-SSE41-NEXT: psllw $8, %xmm0 +; X32-SSE41-NEXT: psraw $8, %xmm0 +; X32-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-SSE41-NEXT: psllw $8, %xmm1 +; X32-SSE41-NEXT: psraw $8, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <16 x i8>* %ptr + %Y = sext <16 x i8> %X to <16 x i16> + ret <16 x i16> %Y +} + +define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { +; SSE2-LABEL: sext_4i8_to_4i64: +; SSE2: # BB#0: +; SSE2-NEXT: pslld $24, %xmm0 +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movd %xmm1, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: cltq +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: sext_4i8_to_4i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: pslld $24, %xmm0 +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] +; SSSE3-NEXT: movd %xmm1, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSSE3-NEXT: movd %xmm1, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: cltq +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: sext_4i8_to_4i64: +; SSE41: # BB#0: +; SSE41-NEXT: pslld $24, %xmm0 +; SSE41-NEXT: psrad $24, %xmm0 +; SSE41-NEXT: pmovzxdq %xmm0, %xmm1 +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm3 +; SSE41-NEXT: movd %xmm1, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm3 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: cltq +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: sext_4i8_to_4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: sext_4i8_to_4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpslld $24, %xmm0, %xmm0 +; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 +; AVX2-NEXT: retq +; +; X32-SSE41-LABEL: sext_4i8_to_4i64: +; X32-SSE41: # BB#0: +; X32-SSE41-NEXT: pslld $24, %xmm0 +; X32-SSE41-NEXT: psrad $24, %xmm0 +; X32-SSE41-NEXT: pmovzxdq %xmm0, %xmm2 +; 
X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pextrd $2, %xmm2, %ecx +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm2 +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm2 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; X32-SSE41-NEXT: movd %xmm1, %eax +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pextrd $2, %xmm1, %ecx +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %ecx +; X32-SSE41-NEXT: pinsrd $3, %ecx, %xmm1 +; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X32-SSE41-NEXT: retl + %extmask = sext <4 x i8> %mask to <4 x i64> + ret <4 x i64> %extmask +} + +define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { +; SSE2-LABEL: load_sext_4i8_to_4i64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movd (%rdi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movsbq %al, %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movsbq %al, %rax +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movsbq %al, %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movsbq %al, %rax +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i8_to_4i64: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movd (%rdi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movsbq %al, %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movsbq %al, %rax +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movsbq %al, %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movsbq %al, %rax +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i8_to_4i64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 +; SSE41-NEXT: pmovzxdq %xmm1, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: movsbq %al, %rax +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: movsbq %al, %rax +; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: movsbq %al, %rax +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: movd %xmm1, %rax +; SSE41-NEXT: movsbq %al, %rax +; SSE41-NEXT: movd %rax, %xmm1 +; 
SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i8_to_4i64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i8_to_4i64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_4i8_to_4i64: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movd (%eax), %xmm0 +; X32-SSE41-NEXT: pmovzxbd %xmm0, %xmm1 +; X32-SSE41-NEXT: pmovzxbq %xmm0, %xmm2 +; X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: movsbl %al, %eax +; X32-SSE41-NEXT: movd %eax, %xmm0 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax +; X32-SSE41-NEXT: movsbl %al, %eax +; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: movsbl %al, %eax +; X32-SSE41-NEXT: movd %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax +; X32-SSE41-NEXT: movsbl %al, %eax +; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i8>* %ptr + %Y = sext <4 x i8> %X to <4 x i64> + ret <4 x i64>%Y +} + +define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { +; SSE2-LABEL: load_sext_4i16_to_4i64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movq (%rdi), %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movswq %ax, %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movswq %ax, %rax +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movswq %ax, %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movd %xmm2, %rax +; SSE2-NEXT: movswq %ax, %rax +; SSE2-NEXT: movd %rax, %xmm2 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_sext_4i16_to_4i64: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movq (%rdi), %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movswq %ax, %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movswq %ax, %rax +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movswq %ax, %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSSE3-NEXT: movd %xmm2, %rax +; SSSE3-NEXT: movswq %ax, %rax +; SSSE3-NEXT: movd %rax, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_sext_4i16_to_4i64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movq (%rdi), %xmm0 +; SSE41-NEXT: pmovzxwd %xmm0, %xmm1 +; SSE41-NEXT: pmovzxwq %xmm0, %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: movswq %ax, %rax +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: movswq %ax, %rax +; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: movswq %ax, %rax +; SSE41-NEXT: movd %rax, %xmm2 +; SSE41-NEXT: movd %xmm1, %rax +; SSE41-NEXT: movswq %ax, %rax +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: load_sext_4i16_to_4i64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_sext_4i16_to_4i64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0 +; AVX2-NEXT: retq +; +; X32-SSE41-LABEL: load_sext_4i16_to_4i64: +; X32-SSE41: # BB#0: # %entry +; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE41-NEXT: movsd (%eax), %xmm0 +; X32-SSE41-NEXT: pmovzxwd %xmm0, %xmm1 +; X32-SSE41-NEXT: pmovzxwq %xmm0, %xmm2 +; X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: cwtl +; X32-SSE41-NEXT: movd %eax, %xmm0 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax +; X32-SSE41-NEXT: cwtl +; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 +; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; X32-SSE41-NEXT: movd %xmm2, %eax +; X32-SSE41-NEXT: cwtl +; X32-SSE41-NEXT: movd %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax +; X32-SSE41-NEXT: cwtl +; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1 +; X32-SSE41-NEXT: sarl $31, %eax +; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; X32-SSE41-NEXT: retl +entry: + %X = load <4 x i16>* %ptr + %Y = sext <4 x i16> %X to <4 x i64> + ret <4 x i64>%Y +} diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index 4da7e42caabf..14058c912861 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1,196 +1,1088 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; FIXME: SSE2 should look like the following: +; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00 +; FIXME: # BB#0: +; FIXME-NEXT: punpcklbw %xmm0, %xmm0 +; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] +; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1] +; FIXME-NEXT: retq +; +; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; SSSE3: # BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pshufb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pshufb %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,5,5] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = 
xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,2,4,5,6,7] -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,6,6,6] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,2,4,5,6,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: punpcklwd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; SSE: # BB#0: +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm0 -; CHECK-SSE2-NEXT: 
retq +; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; SSE: # BB#0: +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; SSE2: # BB#0: +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07: +; SSE: # BB#0: +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) { -; 
CHECK-SSE2-LABEL: @shuffle_v16i8_0101010101010101 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3] -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-SSE2-NEXT: retq +; FIXME: SSE2 should be the following: +; FIXME-LABEL: @shuffle_v16i8_0101010101010101 +; FIXME: # BB#0: +; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] +; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1] +; FIXME-NEXT: retq +; +; SSE2-LABEL: shuffle_v16i8_0101010101010101: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_0101010101010101: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_0101010101010101: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v16i8_0101010101010101: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_0101010101010101: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23 -; CHECK-SSE2: punpcklbw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: +; SSE: # BB#0: +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> ret <16 x i8> %shuffle } +define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) { +; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31: +; SSE: # BB#0: +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> + ret <16 x i8> %shuffle +} + define <16 x i8> 
@shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7] -; CHECK-SSE2-NEXT: punpcklbw %xmm0, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: +; SSE: # BB#0: +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: +; AVX1: # BB#0: +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12 -; CHECK-SSE2: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: punpckhbw %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: packuswb %xmm2, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: 
shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20 -; CHECK-SSE2: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[3,2,1,0,7,6,5,4] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 19, i32 18, i32 17, i32 16, i32 23, i32 22, i32 21, i32 20> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) { -; CHECK-SSE2-LABEL: @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20 -; CHECK-SSE2: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE2-NEXT: punpcklbw 
%xmm2, %xmm3 -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm4 = xmm4[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm4 = xmm4[0],xmm3[1] -; CHECK-SSE2-NEXT: punpckhbw %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: punpcklbw %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1] -; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] +; SSE2-NEXT: movsd %xmm4, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero +; SSSE3-NEXT: por %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[15,14,13,12],zero,zero,zero,zero,xmm1[7,6,5,4] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0],zero,zero,zero,zero,xmm0[11,10,9,8],zero,zero,zero,zero +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 31, i32 30, i32 29, i32 28, i32 11, i32 10, i32 9, i32 8, i32 23, i32 22, i32 21, i32 20> ret <16 x i8> %shuffle } -define <16 x i8> @zext_to_v8i16_shuffle(<16 x i8> %a) { -; CHECK-SSE2-LABEL: @zext_to_v8i16_shuffle -; CHECK-SSE2: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0 - %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, 
i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
+define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
+; SSE2-LABEL: trunc_v4i32_shuffle:
+; SSE2: # BB#0:
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc_v4i32_shuffle:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc_v4i32_shuffle:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc_v4i32_shuffle:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @stress_test0(<16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i8> %s.0.7, <16 x i8> %s.0.8, <16 x i8> %s.0.9) {
+; We don't have anything useful to check here. This generates 100s of
+; instructions. Instead, just make sure we survived codegen.
+; ALL-LABEL: stress_test0:
+; ALL: retq
+entry:
+ %s.1.4 = shufflevector <16 x i8> %s.0.4, <16 x i8> %s.0.5, <16 x i32> <i32 1, i32 22, i32 21, i32 28, i32 3, i32 16, i32 6, i32 1, i32 19, i32 29, i32 12, i32 31, i32 2, i32 3, i32 3, i32 6>
+ %s.1.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> %s.0.6, <16 x i32> <i32 31, i32 20, i32 12, i32 19, i32 2, i32 15, i32 12, i32 31, i32 2, i32 28, i32 2, i32 30, i32 7, i32 8, i32 17, i32 28>
+ %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> %s.0.9, <16 x i32> <i32 14, i32 10, i32 17, i32 5, i32 17, i32 9, i32 17, i32 21, i32 31, i32 24, i32 16, i32 6, i32 20, i32 28, i32 23, i32 8>
+ %s.2.2 = shufflevector <16 x i8> %s.0.3, <16 x i8> %s.0.4, <16 x i32> <i32 20, i32 9, i32 21, i32 11, i32 11, i32 4, i32 3, i32 18, i32 3, i32 30, i32 4, i32 31, i32 11, i32 24, i32 13, i32 29>
+ %s.3.2 = shufflevector <16 x i8> %s.2.2, <16 x i8> %s.1.4, <16 x i32> <i32 15, i32 13, i32 5, i32 11, i32 7, i32 17, i32 14, i32 22, i32 22, i32 16, i32 7, i32 24, i32 16, i32 22, i32 7, i32 29>
+ %s.5.4 = shufflevector <16 x i8> %s.1.5, <16 x i8> %s.1.8, <16 x i32> <i32 3, i32 13, i32 19, i32 7, i32 23, i32 11, i32 1, i32 9, i32 16, i32 25, i32 2, i32 7, i32 0, i32 21, i32 23, i32 17>
+ %s.6.1 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.3.2, <16 x i32> <i32 11, i32 2, i32 28, i32 31, i32 27, i32 3, i32 9, i32 27, i32 25, i32 25, i32 14, i32 7, i32 12, i32 28, i32 12, i32 23>
+ %s.7.1 = shufflevector <16 x i8> %s.6.1, <16 x i8> %s.3.2, <16 x i32> <i32 15, i32 29, i32 14, i32 0, i32 29, i32 15, i32 26, i32 30, i32 6, i32 7, i32 2, i32 8, i32 12, i32 10, i32 29, i32 17>
+ %s.7.2 = shufflevector <16 x i8> %s.3.2, <16 x i8> %s.5.4, <16 x i32> <i32 3, i32 29, i32 3, i32 19, i32 undef, i32 20, i32 undef, i32 3, i32 27, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
+ %s.16.0 = shufflevector <16 x i8> %s.7.1, <16 x i8> %s.7.2, <16 x i32> <i32 13, i32 1, i32 16, i32 16, i32 6, i32 7, i32 29, i32 18, i32 19, i32 28, i32 undef, i32 undef, i32 31, i32 1, i32 undef, i32 10>
+ ret <16 x i8> %s.16.0
+}
+
+define <16 x i8> @stress_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0.9) noinline nounwind {
+; There is nothing interesting to check about these instructions other than
+; that they survive codegen. However, we actually do better and delete all of
+; them because the result is 'undef'.
+;
+; ALL-LABEL: stress_test1:
+; ALL: # BB#0: # %entry
+; ALL-NEXT: retq
+entry:
+ %s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> <i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 6, i32 undef, i32 6, i32 undef, i32 14, i32 14, i32 undef, i32 undef, i32 0>
+ %s.2.4 = shufflevector <16 x i8> undef, <16 x i8> %s.0.5, <16 x i32> <i32 21, i32 undef, i32 undef, i32 19, i32 undef, i32 undef, i32 29, i32 24, i32 21, i32 23, i32 21, i32 17, i32 19, i32 undef, i32 20, i32 22>
+ %s.2.5 = shufflevector <16 x i8> %s.0.5, <16 x i8> undef, <16 x i32> <i32 3, i32 8, i32 undef, i32 7, i32 undef, i32 10, i32 8, i32 0, i32 15, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 9>
+ %s.2.9 = shufflevector <16 x i8> %s.0.9, <16 x i8> undef, <16 x i32> <i32 7, i32 undef, i32 14, i32 7, i32 8, i32 undef, i32 7, i32 8, i32 5, i32 15, i32 undef, i32 1, i32 11, i32 undef, i32 undef, i32 11>
+ %s.3.4 = shufflevector <16 x i8> %s.2.4, <16 x i8> %s.0.5, <16 x i32> <i32 5, i32 0, i32 21, i32 6, i32 15, i32 27, i32 22, i32 21, i32 4, i32 22, i32 19, i32 26, i32 9, i32 26, i32 8, i32 29>
+ %s.3.9 = shufflevector <16 x i8> %s.2.9, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 8, i32 1, i32 undef, i32 4, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 undef>
+ %s.4.7 = shufflevector <16 x i8> %s.1.8, <16 x i8> %s.2.9, <16 x i32> <i32 9, i32 0, i32 22, i32 20, i32 24, i32 7, i32 21, i32 17, i32 20, i32 12, i32 19, i32 23, i32 2, i32 9, i32 17, i32 10>
+ %s.4.8 = shufflevector <16 x i8> %s.2.9, <16 x i8> %s.3.9, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 6, i32 10, i32 undef, i32 0, i32 5, i32 undef, i32 9, i32 undef>
+ %s.5.7 = shufflevector <16 x i8> %s.4.7, <16 x i8> %s.4.8, <16 x i32> <i32 16, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %s.8.4 = shufflevector <16 x i8> %s.3.4, <16 x i8> %s.5.7, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 28, i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %s.9.4 = shufflevector <16 x i8> %s.8.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 5>
+ %s.10.4 = shufflevector <16 x i8> %s.9.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %s.12.4 = shufflevector <16 x i8> %s.10.4, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef>
+
+ ret <16 x i8> %s.12.4
+}
+
+define <16 x i8> @PR20540(<8 x i8> %a) {
+; SSE2-LABEL: PR20540:
+; SSE2: # BB#0:
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: PR20540: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: PR20540: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: PR20540: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { +; SSE2-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 0 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { +; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 0 + %shuffle = shufflevector <16 
x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) { +; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; SSE: # BB#0: +; SSE-NEXT: movd %edi, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; AVX-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 0 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 16> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { +; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %a = insertelement <16 x i8> undef, i8 %i, i32 3 + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) { +; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: +; SSE: # BB#0: +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu: +; AVX: # BB#0: +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef> 
+ ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) { +; SSE-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; SSE: # BB#0: +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 undef, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14> ret <16 x i8> %shuffle } -define <16 x i8> @zext_to_v4i32_shuffle(<16 x i8> %a) { -; CHECK-SSE2-LABEL: @zext_to_v4i32_shuffle -; CHECK-SSE2: pxor %xmm1, %xmm1 -; 
CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0 -; CHECK-SSE2-NEXT: punpcklbw %xmm1, %xmm0 +define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: +; SSE41: # BB#0: +; 
SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: +; SSSE3: # BB#0: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd %xmm0, %xmm0 +; AVX-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, 
i32 3, i32 29, i32 30, i32 31> ret <16 x i8> %shuffle } -define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) { -; CHECK-SSE2-LABEL: @trunc_v4i32_shuffle -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pand -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,3,4,5,6,7] -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: packuswb %xmm0, %xmm0 -; CHECK-SSE2-NEXT: retq - %shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> +define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: +; SSSE3: # BB#0: +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbw %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbw %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef> + ret <16 x i8> %shuffle +} + +define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) { +; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbw %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbw %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31> ret <16 x i8> %shuffle } + +define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) { +; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 
= xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,0,3,1,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,3,1,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: packuswb %xmm0, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,1,3,4,5,6,7] +; SSE2-NEXT: packuswb %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = 
xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,7,1,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,6,2,2,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,7,14,2,3,14,9,0,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT: retq
+entry:
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
+
+ ret <16 x i8> %shuffle
+}
+
+define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) {
+; Nothing interesting to test here. Just make sure we didn't crash.
+; ALL-LABEL: stress_test2:
+; ALL: retq
+entry:
+ %s.1.0 = shufflevector <16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i32> <i32 29, i32 30, i32 2, i32 16, i32 26, i32 21, i32 11, i32 26, i32 26, i32 3, i32 4, i32 5, i32 30, i32 28, i32 15, i32 5>
+ %s.1.1 = shufflevector <16 x i8> %s.0.1, <16 x i8> %s.0.2, <16 x i32> <i32 31, i32 1, i32 24, i32 12, i32 28, i32 5, i32 2, i32 9, i32 29, i32 1, i32 31, i32 5, i32 6, i32 17, i32 15, i32 22>
+ %s.2.0 = shufflevector <16 x i8> %s.1.0, <16 x i8> %s.1.1, <16 x i32> <i32 22, i32 1, i32 12, i32 3, i32 30, i32 4, i32 30, i32 undef, i32 1, i32 10, i32 14, i32 18, i32 27, i32 13, i32 16, i32 19>
+
+ ret <16 x i8> %s.2.0
+}
+
+define void @constant_gets_selected(<4 x i32>* %ptr1, <4 x i32>* %ptr2) {
+; SSE-LABEL: constant_gets_selected:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movaps %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, (%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: constant_gets_selected:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, (%rsi)
+; AVX-NEXT: retq
+entry:
+ %weird_zero = bitcast <4 x i32> zeroinitializer to <16 x i8>
+ %shuffle.i = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %weird_zero, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+ %weirder_zero = bitcast <16 x i8> %shuffle.i to <4 x i32>
+ store <4 x i32> %weirder_zero, <4 x i32>* %ptr1, align 16
+ store <4 x i32> zeroinitializer, <4 x i32>* %ptr2, align 16
+ ret void
+}
+
+define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
+; SSE2-LABEL: PR12412:
+; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: PR12412:
+; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,2,4,6,8,10,12,14]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: PR12412:
+; SSE41: # BB#0: # %entry
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,2,4,6,8,10,12,14]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: PR12412:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,2,4,6,8,10,12,14]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+ %0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ ret <16 x i8> %0
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 78b4ee7e5dd0..57fa0e859813 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1,219 +1,1134 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"

 define <2 x i64> @shuffle_v2i64_00(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_00
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_00:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v2i64_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v2i64_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: retq
 %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 0>
 ret <2 x i64> %shuffle
 }

 define <2 x i64> @shuffle_v2i64_10(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v2i64_10
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v2i64_10:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+;
SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_10: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 0> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_11(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_11 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_11: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_11: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 1> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_22(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_22 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[0,1,0,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_22: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; SSE-NEXT: retq +; +; AVX1-LABEL: shuffle_v2i64_22: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_22: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm1, %xmm0 +; AVX2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 2> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_32(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_32 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,0,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_32: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_32: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 2> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_33 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_33: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_33: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 3> ret <2 x i64> %shuffle } define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2f64_00 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0,0] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2f64_00: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2f64_00: +; SSE3: # BB#0: +; SSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2f64_00: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2f64_00: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_00: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0> ret <2 x double> %shuffle } define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2f64_10 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: 
shuffle_v2f64_10: +; SSE: # BB#0: +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_10: +; AVX: # BB#0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 0> ret <2 x double> %shuffle } define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2f64_11 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2f64_11: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_11: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1> ret <2 x double> %shuffle } define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) { -; FIXME: Should these use movapd + shufpd to remove a domain change at the cost -; of a mov? +; SSE2-LABEL: shuffle_v2f64_22: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2f64_22: +; SSE3: # BB#0: +; SSE3-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0] +; SSE3-NEXT: movapd %xmm1, %xmm0 +; SSE3-NEXT: retq ; -; CHECK-SSE2-LABEL: @shuffle_v2f64_22 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[0,1,0,1] -; CHECK-SSE2-NEXT: retq +; SSSE3-LABEL: shuffle_v2f64_22: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0] +; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2f64_22: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_22: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0,0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2> ret <2 x double> %shuffle } define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2f64_32 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,0,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2f64_32: +; SSE: # BB#0: +; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_32: +; AVX: # BB#0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 2> ret <2 x double> %shuffle } define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2f64_33 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2f64_33: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_33: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm1[1,1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3> ret <2 x double> %shuffle } +define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) { +; SSE2-LABEL: shuffle_v2f64_03: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2f64_03: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm0, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2f64_03: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd 
%xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2f64_03: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_03: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; AVX-NEXT: retq + %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %shuffle +} +define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) { +; SSE2-LABEL: shuffle_v2f64_21: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2f64_21: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2f64_21: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2f64_21: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_21: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX-NEXT: retq + %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1> + ret <2 x double> %shuffle +} define <2 x i64> @shuffle_v2i64_02(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_02 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_02: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_02: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_02_copy -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[0] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_02_copy: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_02_copy: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm2[0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_03 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_03: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_03: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm0, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_03: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_03: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v2i64_03: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_03: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, 
<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_03_copy -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_03_copy: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm2 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_03_copy: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm1, %xmm2 +; SSE3-NEXT: movaps %xmm2, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_03_copy: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm1, %xmm2 +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_03_copy: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v2i64_03_copy: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_03_copy: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3] +; AVX2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_12 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_12: +; SSE2: # BB#0: +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_12: +; SSE3: # BB#0: +; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_12: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_12: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_12: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_12_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_12_copy -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm2[0] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_12_copy: +; SSE2: # BB#0: +; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_12_copy: +; SSE3: # BB#0: +; SSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] +; SSE3-NEXT: movapd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_12_copy: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_12_copy: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_12_copy: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2> ret <2 x i64> %shuffle } define <2 x i64> 
@shuffle_v2i64_13(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_13 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_13: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_13: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_13_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_13_copy -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_13_copy: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_13_copy: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_20(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_20 -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_20: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_20: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 0> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_20_copy -; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[0] -; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_20_copy: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_20_copy: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 0> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_21 -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_21: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_21: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_21: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_21: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v2i64_21: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_21: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) 
{ -; CHECK-SSE2-LABEL: @shuffle_v2i64_21_copy -; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1] -; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_21_copy: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm2, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_21_copy: +; SSE3: # BB#0: +; SSE3-NEXT: movsd %xmm2, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_21_copy: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm2, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_21_copy: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v2i64_21_copy: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_21_copy: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3] +; AVX2-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_30(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_30 -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[0] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_30: +; SSE2: # BB#0: +; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_30: +; SSE3: # BB#0: +; SSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE3-NEXT: movapd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_30: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_30: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_30: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_30_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_30_copy -; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[1],xmm1[0] -; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v2i64_30_copy: +; SSE2: # BB#0: +; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_30_copy: +; SSE3: # BB#0: +; SSE3-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0] +; SSE3-NEXT: movapd %xmm2, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_30_copy: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_30_copy: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_30_copy: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0> ret <2 x i64> %shuffle } define <2 x 
i64> @shuffle_v2i64_31(<2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_31 -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[1],xmm0[1] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_31: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_31: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1> ret <2 x i64> %shuffle } define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) { -; CHECK-SSE2-LABEL: @shuffle_v2i64_31_copy -; CHECK-SSE2: shufpd {{.*}} # xmm2 = xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movapd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v2i64_31_copy: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_31_copy: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm1[1] +; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1> ret <2 x i64> %shuffle } + +define <2 x i64> @shuffle_v2i64_0z(<2 x i64> %a) { +; SSE-LABEL: shuffle_v2i64_0z: +; SSE: # BB#0: +; SSE-NEXT: movq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_0z: +; AVX: # BB#0: +; AVX-NEXT: vmovq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3> + ret <2 x i64> %shuffle +} + +define <2 x i64> @shuffle_v2i64_1z(<2 x i64> %a) { +; SSE-LABEL: shuffle_v2i64_1z: +; SSE: # BB#0: +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_1z: +; AVX: # BB#0: +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 1, i32 3> + ret <2 x i64> %shuffle +} + +define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) { +; SSE-LABEL: shuffle_v2i64_z0: +; SSE: # BB#0: +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2i64_z0: +; AVX: # BB#0: +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq + %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 0> + ret <2 x i64> %shuffle +} + +define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) { +; SSE2-LABEL: shuffle_v2i64_z1: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2i64_z1: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2i64_z1: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2i64_z1: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v2i64_z1: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v2i64_z1: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, 
%xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq + %shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 1> + ret <2 x i64> %shuffle +} + +define <2 x double> @shuffle_v2f64_0z(<2 x double> %a) { +; SSE-LABEL: shuffle_v2f64_0z: +; SSE: # BB#0: +; SSE-NEXT: movq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_0z: +; AVX: # BB#0: +; AVX-NEXT: vmovq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3> + ret <2 x double> %shuffle +} + +define <2 x double> @shuffle_v2f64_1z(<2 x double> %a) { +; SSE-LABEL: shuffle_v2f64_1z: +; SSE: # BB#0: +; SSE-NEXT: xorpd %xmm1, %xmm1 +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_1z: +; AVX: # BB#0: +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: retq + %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 3> + ret <2 x double> %shuffle +} + +define <2 x double> @shuffle_v2f64_z0(<2 x double> %a) { +; SSE-LABEL: shuffle_v2f64_z0: +; SSE: # BB#0: +; SSE-NEXT: xorpd %xmm1, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_z0: +; AVX: # BB#0: +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq + %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 0> + ret <2 x double> %shuffle +} + +define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) { +; SSE2-LABEL: shuffle_v2f64_z1: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v2f64_z1: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v2f64_z1: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v2f64_z1: +; SSE41: # BB#0: +; SSE41-NEXT: xorpd %xmm1, %xmm1 +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v2f64_z1: +; AVX: # BB#0: +; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX-NEXT: retq + %shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1> + ret <2 x double> %shuffle +} + +define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) { +; SSE-LABEL: insert_reg_and_zero_v2i64: +; SSE: # BB#0: +; SSE-NEXT: movd %rdi, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_and_zero_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vmovq %rdi, %xmm0 +; AVX-NEXT: retq + %v = insertelement <2 x i64> undef, i64 %a, i32 0 + %shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3> + ret <2 x i64> %shuffle +} + +define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) { +; SSE-LABEL: insert_mem_and_zero_v2i64: +; SSE: # BB#0: +; SSE-NEXT: movq (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_and_zero_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vmovq (%rdi), %xmm0 +; AVX-NEXT: retq + %a = load i64* %ptr + %v = insertelement <2 x i64> undef, i64 %a, i32 0 + %shuffle = shufflevector <2 x i64> %v, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3> + ret <2 x i64> %shuffle +} + +define <2 x double> @insert_reg_and_zero_v2f64(double %a) { +; 
SSE-LABEL: insert_reg_and_zero_v2f64: +; SSE: # BB#0: +; SSE-NEXT: movq %xmm0, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_and_zero_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vmovq %xmm0, %xmm0 +; AVX-NEXT: retq + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3> + ret <2 x double> %shuffle +} + +define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) { +; SSE-LABEL: insert_mem_and_zero_v2f64: +; SSE: # BB#0: +; SSE-NEXT: movsd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_and_zero_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vmovsd (%rdi), %xmm0 +; AVX-NEXT: retq + %a = load double* %ptr + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3> + ret <2 x double> %shuffle +} + +define <2 x i64> @insert_reg_lo_v2i64(i64 %a, <2 x i64> %b) { +; SSE2-LABEL: insert_reg_lo_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movd %rdi, %xmm1 +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_reg_lo_v2i64: +; SSE3: # BB#0: +; SSE3-NEXT: movd %rdi, %xmm1 +; SSE3-NEXT: movsd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_reg_lo_v2i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %rdi, %xmm1 +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_reg_lo_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: movd %rdi, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: insert_reg_lo_v2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovq %rdi, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_reg_lo_v2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovq %rdi, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq + %v = insertelement <2 x i64> undef, i64 %a, i32 0 + %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3> + ret <2 x i64> %shuffle +} + +define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) { +; SSE2-LABEL: insert_mem_lo_v2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movlpd (%rdi), %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_mem_lo_v2i64: +; SSE3: # BB#0: +; SSE3-NEXT: movlpd (%rdi), %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_mem_lo_v2i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: movlpd (%rdi), %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_mem_lo_v2i64: +; SSE41: # BB#0: +; SSE41-NEXT: movq (%rdi), %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: insert_mem_lo_v2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovq (%rdi), %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: insert_mem_lo_v2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovq (%rdi), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq + %a = load i64* %ptr + %v = insertelement <2 x i64> undef, i64 %a, i32 0 + %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3> + ret <2 x i64> %shuffle +} + +define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) { +; SSE-LABEL: insert_reg_hi_v2i64: +; SSE: # BB#0: +; SSE-NEXT: movd %rdi, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_hi_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vmovq %rdi, %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %v = 
insertelement <2 x i64> undef, i64 %a, i32 0 + %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0> + ret <2 x i64> %shuffle +} + +define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) { +; SSE-LABEL: insert_mem_hi_v2i64: +; SSE: # BB#0: +; SSE-NEXT: movq (%rdi), %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_hi_v2i64: +; AVX: # BB#0: +; AVX-NEXT: vmovq (%rdi), %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %a = load i64* %ptr + %v = insertelement <2 x i64> undef, i64 %a, i32 0 + %shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0> + ret <2 x i64> %shuffle +} + +define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) { +; SSE-LABEL: insert_reg_lo_v2f64: +; SSE: # BB#0: +; SSE-NEXT: movsd %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_lo_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %shuffle +} + +define <2 x double> @insert_mem_lo_v2f64(double* %ptr, <2 x double> %b) { +; SSE-LABEL: insert_mem_lo_v2f64: +; SSE: # BB#0: +; SSE-NEXT: movlpd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_lo_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vmovlpd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %a = load double* %ptr + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 0, i32 3> + ret <2 x double> %shuffle +} + +define <2 x double> @insert_reg_hi_v2f64(double %a, <2 x double> %b) { +; SSE-LABEL: insert_reg_hi_v2f64: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_reg_hi_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 2, i32 0> + ret <2 x double> %shuffle +} + +define <2 x double> @insert_mem_hi_v2f64(double* %ptr, <2 x double> %b) { +; SSE-LABEL: insert_mem_hi_v2f64: +; SSE: # BB#0: +; SSE-NEXT: movhpd (%rdi), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: insert_mem_hi_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vmovhpd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: retq + %a = load double* %ptr + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 2, i32 0> + ret <2 x double> %shuffle +} + +define <2 x double> @insert_dup_reg_v2f64(double %a) { +; FIXME: We should match movddup for SSE3 and higher here. 
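+; Illustration only (not one of this test's checks): with SSE3's movddup,
+; duplicating the low double could plausibly be a single instruction, e.g.
+;   movddup %xmm0, %xmm0
+; rather than the movlhps/unpcklpd sequences checked below.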
+; +; SSE2-LABEL: insert_dup_reg_v2f64: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_dup_reg_v2f64: +; SSE3: # BB#0: +; SSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_dup_reg_v2f64: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_dup_reg_v2f64: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: insert_dup_reg_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: retq + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0> + ret <2 x double> %shuffle +} +define <2 x double> @insert_dup_mem_v2f64(double* %ptr) { +; SSE2-LABEL: insert_dup_mem_v2f64: +; SSE2: # BB#0: +; SSE2-NEXT: movsd (%rdi), %xmm0 +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: insert_dup_mem_v2f64: +; SSE3: # BB#0: +; SSE3-NEXT: movddup (%rdi), %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: insert_dup_mem_v2f64: +; SSSE3: # BB#0: +; SSSE3-NEXT: movddup (%rdi), %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: insert_dup_mem_v2f64: +; SSE41: # BB#0: +; SSE41-NEXT: movddup (%rdi), %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: insert_dup_mem_v2f64: +; AVX: # BB#0: +; AVX-NEXT: vmovddup (%rdi), %xmm0 +; AVX-NEXT: retq + %a = load double* %ptr + %v = insertelement <2 x double> undef, double %a, i32 0 + %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0> + ret <2 x double> %shuffle +} + +define <2 x double> @shuffle_mem_v2f64_10(<2 x double>* %ptr) { +; SSE-LABEL: shuffle_mem_v2f64_10: +; SSE: # BB#0: +; SSE-NEXT: movapd (%rdi), %xmm0 +; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_mem_v2f64_10: +; AVX: # BB#0: +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0] +; AVX-NEXT: retq + %a = load <2 x double>* %ptr + %shuffle = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 0> + ret <2 x double> %shuffle +} diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll index 7d496fa19f15..53fb09e32428 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1,170 +1,1361 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0001 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,0,0,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0001: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0001: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0020 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,0,2,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0020: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0020: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> ret <4 x i32> %shuffle } +define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: shuffle_v4i32_0112: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0112: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2> + ret <4 x i32> %shuffle +} define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0300 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[0,3,0,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0300: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0300: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_1000 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[1,0,0,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_1000: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_1000: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_2200 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[2,2,0,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_2200: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_2200: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,0,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_3330 -; CHECK-SSE2: pshufd {{.*}} # 
xmm0 = xmm0[3,3,3,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_3330: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_3330: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_3210 -; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm0[3,2,1,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_3210: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_3210: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ret <4 x i32> %shuffle } +define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: shuffle_v4i32_2121: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_2121: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,1] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1> + ret <4 x i32> %shuffle +} + define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_0001 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,0,0,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_0001: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_0001: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> ret <4 x float> %shuffle } define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_0020 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,0,2,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_0020: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_0020: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> ret <4 x float> %shuffle } define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_0300 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,3,0,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_0300: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_0300: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> ret <4 x float> %shuffle } define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_1000 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[1,0,0,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_1000: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_1000: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, 
i32 0, i32 0, i32 0> ret <4 x float> %shuffle } define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_2200 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[2,2,0,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_2200: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_2200: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> ret <4 x float> %shuffle } define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_3330 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[3,3,3,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_3330: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_3330: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> ret <4 x float> %shuffle } define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4f32_3210 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[3,2,1,0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4f32_3210: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_3210: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> ret <4 x float> %shuffle } +define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: shuffle_v4f32_0011: +; SSE: # BB#0: +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_0011: +; AVX: # BB#0: +; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: shuffle_v4f32_2233: +; SSE: # BB#0: +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_2233: +; AVX: # BB#0: +; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: shuffle_v4f32_0022: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_0022: +; SSE3: # BB#0: +; SSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_0022: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_0022: +; SSE41: # BB#0: +; SSE41-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_0022: +; AVX: # BB#0: +; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: shuffle_v4f32_1133: 
+; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_1133: +; SSE3: # BB#0: +; SSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_1133: +; SSSE3: # BB#0: +; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_1133: +; SSE41: # BB#0: +; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_1133: +; AVX: # BB#0: +; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3> + ret <4 x float> %shuffle +} define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0124 -; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0] -; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[2,0] -; CHECK-SSE2-NEXT: retq +; SSE2-LABEL: shuffle_v4i32_0124: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_0124: +; SSE3: # BB#0: +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_0124: +; SSSE3: # BB#0: +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_0124: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0124: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0142 -; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[2,0] -; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,2] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0142: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0142: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0412 -; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[0,0] -; CHECK-SSE2-NEXT: shufps {{.*}} # xmm1 = xmm1[2,0],xmm0[1,2] -; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0412: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0412: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[1,2] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) { -; 
CHECK-SSE2-LABEL: @shuffle_v4i32_4012 -; CHECK-SSE2: shufps {{.*}} # xmm1 = xmm1[0,0],xmm0[0,0] -; CHECK-SSE2-NEXT: shufps {{.*}} # xmm1 = xmm1[0,2],xmm0[1,2] -; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_4012: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_4012: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,2] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0145 -; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[0] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0145: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0145: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_0451 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,1] -; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2,3,1] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_0451: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_0451: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_4501 -; CHECK-SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[0] -; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_4501: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_4501: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> ret <4 x i32> %shuffle } define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) { -; CHECK-SSE2-LABEL: @shuffle_v4i32_4015 -; CHECK-SSE2: shufps {{.*}} # xmm0 = xmm0[0,1],xmm1[0,1] -; CHECK-SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[2,0,1,3] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: shuffle_v4i32_4015: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_4015: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,1,3] +; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5> ret <4 x i32> %shuffle } + +define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) { +; SSE2-LABEL: shuffle_v4f32_4zzz: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: 
movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_4zzz: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_4zzz: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_4zzz: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_4zzz: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) { +; SSE2-LABEL: shuffle_v4f32_z4zz: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_z4zz: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_z4zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_z4zz: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_z4zz: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) { +; SSE2-LABEL: shuffle_v4f32_zz4z: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_zz4z: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_zz4z: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_zz4z: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_zz4z: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) { +; SSE2-LABEL: shuffle_v4f32_zuu4: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_zuu4: +; 
SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_zuu4: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_zuu4: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_zuu4: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) { +; SSE2-LABEL: shuffle_v4f32_zzz7: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_zzz7: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_zzz7: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_zzz7: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_zzz7: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) { +; SSE2-LABEL: shuffle_v4f32_z6zz: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4f32_z6zz: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4f32_z6zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4f32_z6zz: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4f32_z6zz: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3> + ret <4 x float> %shuffle +} + +define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) { +; SSE2-LABEL: shuffle_v4i32_4zzz: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_4zzz: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; 
SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_4zzz: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_4zzz: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_4zzz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) { +; SSE2-LABEL: shuffle_v4i32_z4zz: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_z4zz: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_z4zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_z4zz: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_z4zz: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) { +; SSE2-LABEL: shuffle_v4i32_zz4z: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_zz4z: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: movss %xmm0, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_zz4z: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_zz4z: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_zz4z: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) { +; SSE-LABEL: shuffle_v4i32_zuu4: +; SSE: # BB#0: +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_zuu4: +; AVX: # BB#0: +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) { +; SSE2-LABEL: shuffle_v4i32_z6zz: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_z6zz: +; SSE3: # BB#0: +; SSE3-NEXT: xorps %xmm1, %xmm1 +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_z6zz: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_z6zz: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_z6zz: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: shuffle_v4i32_7012: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_7012: +; SSE3: # BB#0: +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_7012: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_7012: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_7012: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: shuffle_v4i32_6701: +; SSE2: # BB#0: +; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_6701: +; SSE3: # BB#0: +; SSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0] +; SSE3-NEXT: movapd %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_6701: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_6701: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_6701: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> 
%b) { +; SSE2-LABEL: shuffle_v4i32_5670: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_5670: +; SSE3: # BB#0: +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0] +; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_5670: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_5670: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_5670: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: shuffle_v4i32_1234: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_1234: +; SSE3: # BB#0: +; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_1234: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_1234: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_1234: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: shuffle_v4i32_2345: +; SSE2: # BB#0: +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_2345: +; SSE3: # BB#0: +; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: shuffle_v4i32_2345: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v4i32_2345: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v4i32_2345: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5> + ret <4 x i32> %shuffle +} + +define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: shuffle_v4i32_3456: +; SSE2: # BB#0: +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] +; SSE2-NEXT: retq +; +; SSE3-LABEL: shuffle_v4i32_3456: +; SSE3: # BB#0: +; SSE3-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[3,0],xmm1[0,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_3456:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_3456:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_3456:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: shuffle_v4i32_0u1u:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_0u1u:
+; SSE3: # BB#0:
+; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0u1u:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_0u1u:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxdq %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_0u1u:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxdq %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
+; SSE2-LABEL: shuffle_v4i32_0z1z:
+; SSE2: # BB#0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v4i32_0z1z:
+; SSE3: # BB#0:
+; SSE3-NEXT: pxor %xmm1, %xmm1
+; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v4i32_0z1z:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v4i32_0z1z:
+; SSE41: # BB#0:
+; SSE41-NEXT: pmovzxdq %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_0z1z:
+; AVX: # BB#0:
+; AVX-NEXT: vpmovzxdq %xmm0, %xmm0
+; AVX-NEXT: retq
+ %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
+; SSE-LABEL: insert_reg_and_zero_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movd %edi, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_and_zero_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: retq
+ %v = insertelement <4 x i32> undef, i32 %a, i32 0
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_and_zero_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovd (%rdi), %xmm0
+; AVX-NEXT: retq
+ %a = load i32* %ptr
+ %v = insertelement <4 x i32> undef, i32 %a, i32 0
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
+; SSE2-LABEL: insert_reg_and_zero_v4f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_reg_and_zero_v4f32:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movss %xmm0, %xmm1
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_reg_and_zero_v4f32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movss %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_reg_and_zero_v4f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_reg_and_zero_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vmovss %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %v = insertelement <4 x float> undef, float %a, i32 0
+ %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
+; SSE-LABEL: insert_mem_and_zero_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_and_zero_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovss (%rdi), %xmm0
+; AVX-NEXT: retq
+ %a = load float* %ptr
+ %v = insertelement <4 x float> undef, float %a, i32 0
+ %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
+; SSE2-LABEL: insert_reg_lo_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movd %rdi, %xmm1
+; SSE2-NEXT: movsd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_reg_lo_v4i32:
+; SSE3: # BB#0:
+; SSE3-NEXT: movd %rdi, %xmm1
+; SSE3-NEXT: movsd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_reg_lo_v4i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movd %rdi, %xmm1
+; SSSE3-NEXT: movsd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_reg_lo_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movd %rdi, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_reg_lo_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq %rdi, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_reg_lo_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq %rdi, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
+ %a.cast = bitcast i64 %a to <2 x i32>
+ %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
+; SSE2-LABEL: insert_mem_lo_v4i32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movlpd (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_mem_lo_v4i32:
+; SSE3: # BB#0:
+; SSE3-NEXT: movlpd (%rdi), %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_mem_lo_v4i32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movlpd (%rdi), %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_mem_lo_v4i32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movq (%rdi), %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_mem_lo_v4i32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq (%rdi), %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_mem_lo_v4i32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
+ %a = load <2 x i32>* %ptr
+ %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
+; SSE-LABEL: insert_reg_hi_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movd %rdi, %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_hi_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq %rdi, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %a.cast = bitcast i64 %a to <2 x i32>
+ %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
+; SSE-LABEL: insert_mem_hi_v4i32:
+; SSE: # BB#0:
+; SSE-NEXT: movq (%rdi), %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_hi_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovq (%rdi), %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
+ %a = load <2 x i32>* %ptr
+ %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x i32> %shuffle
+}
+
+define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
+; SSE-LABEL: insert_reg_lo_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movsd %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_lo_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %a.cast = bitcast double %a to <2 x float>
+ %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE-LABEL: insert_mem_lo_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movlpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_lo_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovlpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = load <2 x float>* %ptr
+ %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
+; SSE-LABEL: insert_reg_hi_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_reg_hi_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: retq
+ %a.cast = bitcast double %a to <2 x float>
+ %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
+; SSE-LABEL: insert_mem_hi_v4f32:
+; SSE: # BB#0:
+; SSE-NEXT: movhpd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: insert_mem_hi_v4f32:
+; AVX: # BB#0:
+; AVX-NEXT: vmovhpd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a = load <2 x float>* %ptr
+ %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
+; SSE-LABEL: shuffle_mem_v4f32_3210:
+; SSE: # BB#0:
+; SSE-NEXT: movaps (%rdi), %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_mem_v4f32_3210:
+; AVX: # BB#0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; AVX-NEXT: retq
+ %a = load <4 x float>* %ptr
+ %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x float> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 5d1922a34837..de25a16a2e00 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1,493 +1,1919 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
 define <8 x i16> @shuffle_v8i16_01012323(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_01012323
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,0,1,1]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_01012323:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_01012323:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_67452301(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_67452301
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,2,1,0]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_67452301:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_67452301:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_456789AB
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1],xmm1[0]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_456789AB:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_456789AB:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_456789AB:
+; SSE41: # BB#0:
+; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_456789AB:
+; AVX: # BB#0:
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_00000000
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_00000000:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_00000000:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_00000000:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v8i16_00000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i16_00000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_00004444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_00004444:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_00004444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
 ret <8 x i16> %shuffle
 }
 
+define <8 x i16> @shuffle_v8i16_u0u1u2u3(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_u0u1u2u3:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_u0u1u2u3:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_u4u5u6u7(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_u4u5u6u7:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_u4u5u6u7:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7>
+ ret <8 x i16> %shuffle
+}
 
 define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_31206745
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,3,2]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_31206745:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_31206745:
+; AVX: # BB#0:
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 5>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_44440000(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_44440000
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_44440000:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_44440000:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_44440000:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_44440000:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0>
 ret <8 x i16> %shuffle
 }
 
+define <8 x i16> @shuffle_v8i16_23016745(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_23016745:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_23016745:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_23026745(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_23026745:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_23026745:
+; AVX: # BB#0:
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 5>
+ ret <8 x i16> %shuffle
+}
+define <8 x i16> @shuffle_v8i16_23016747(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_23016747:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_23016747:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 7>
+ ret <8 x i16> %shuffle
+}
 
 define <8 x i16> @shuffle_v8i16_75643120(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_75643120
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,6,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_75643120:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_75643120:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_75643120:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_75643120:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 0>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_10545410(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_10545410
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_10545410:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_10545410:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_10545410:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_10545410:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 0>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_54105410(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_54105410
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,4,7,6]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_54105410:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_54105410:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_54105410:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_54105410:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 0>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_54101054(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_54101054
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_54101054:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_54101054:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_54101054:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_54101054:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 4>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_04400440(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04400440
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,4,4,6]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_04400440:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,4,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_04400440:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_04400440:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_04400440:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 0>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_40044004(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_40044004
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,0,0,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_40044004:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,0,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_40044004:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_40044004:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_40044004:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 4>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_26405173(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_26405173
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_26405173:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_26405173:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_26405173:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_26405173:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 3>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_20645173(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_20645173
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,6,4,7]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_20645173:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_20645173:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_20645173:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_20645173:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 3>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_26401375
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_26401375:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_26401375:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_26401375:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_26401375:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5>
 ret <8 x i16> %shuffle
 }
 
+define <8 x i16> @shuffle_v8i16_66751643(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_66751643:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,3,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,4,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_66751643:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_66751643:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_66751643:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 3>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_60514754(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_60514754:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,5,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_60514754:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_60514754:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_60514754:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 6, i32 0, i32 5, i32 1, i32 4, i32 7, i32 5, i32 4>
+ ret <8 x i16> %shuffle
+}
+
 define <8 x i16> @shuffle_v8i16_00444444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_00444444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_00444444:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_00444444:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_00444444:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_00444444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_44004444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_44004444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,2,0,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_44004444:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,0,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_44004444:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_44004444:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_44004444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_04404444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04404444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_04404444:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_04404444:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_04404444:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_04404444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_04400000(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04400000
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,0,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_04400000:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_04400000:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_04400000:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_04400000:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_04404567
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_04404567:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_04404567:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 7>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0X444444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0X444444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,2,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_0X444444:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0X444444:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_0X444444:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0X444444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_44X04444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_44X04444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_44X04444:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_44X04444:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_44X04444:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_44X04444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 4>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_X4404444(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_X4404444
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_X4404444:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_X4404444:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_X4404444:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_X4404444:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0127XXXX
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_0127XXXX:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_0127XXXX:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_0127XXXX:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0127XXXX:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXX4563
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,0]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_XXXX4563:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XXXX4563:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_XXXX4563:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_XXXX4563:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 3>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_4563XXXX
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,2,3]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_4563XXXX:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_4563XXXX:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_4563XXXX:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_4563XXXX:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_01274563
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,1,3]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_01274563:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_01274563:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_01274563:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_01274563:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 3>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_45630127
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,1,2,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,0,1,3]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,6,7,5,4]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_45630127:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_45630127:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_45630127:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_45630127:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 7>
 ret <8 x i16> %shuffle
 }
 
+define <8 x i16> @shuffle_v8i16_37102735(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_37102735:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,6]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_37102735:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_37102735:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_37102735:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 5>
+ ret <8 x i16> %shuffle
+}
+
 define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_08192a3b
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_08192a3b:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_08192a3b:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d2e3f
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_0c1d2e3f:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0c1d2e3f:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_4c5d6e7f
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_4c5d6e7f:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_4c5d6e7f:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_48596a7b
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_48596a7b:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_48596a7b:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_08196e7f
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_08196e7f:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_08196e7f:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 6, i32 14, i32 7, i32 15>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0c1d6879
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,0,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_0c1d6879:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0c1d6879:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 6, i32 8, i32 7, i32 9>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_109832ba
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm0[2,0,3,1,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[2,0,3,1,4,5,6,7]
-; CHECK-SSE2-NEXT: punpcklqdq %xmm0, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_109832ba:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,3,1,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_109832ba:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[2,0,3,1,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_8091a2b3(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_8091a2b3
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: punpcklwd %xmm0, %xmm1
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_8091a2b3:
+; SSE: # BB#0:
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_8091a2b3:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_c4d5e6f7
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm2 = xmm0[2,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_c4d5e6f7:
+; SSE: # BB#0:
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_c4d5e6f7:
+; AVX: # BB#0:
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_0213cedf
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,1,3,4,5,6,7]
-; CHECK-SSE2-NEXT: punpcklqdq %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE-LABEL: shuffle_v8i16_0213cedf:
+; SSE: # BB#0:
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_0213cedf:
+; AVX: # BB#0:
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15>
 ret <8 x i16> %shuffle
 }
 
+define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: shuffle_v8i16_443aXXXX:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_443aXXXX:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_443aXXXX:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_443aXXXX:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,12,13,10,11,12,13,10,11,12,13,14,15]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %shuffle
+}
+
 define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_032dXXXX
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_032dXXXX:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_032dXXXX:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_032dXXXX:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_032dXXXX:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,6,7,8,9,12,13,12,13,14,15]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 3, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
 ret <8 x i16> %shuffle
 }
 
-define <8 x i16> @shuffle_v8i16_XXXcXXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXcXXXX
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; CHECK-SSE2-NEXT: retq
+define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
+; SSE-LABEL: shuffle_v8i16_XXXdXXXX:
+; SSE: # BB#0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_XXXdXXXX:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_012dXXXX
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_012dXXXX:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_012dXXXX:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_012dXXXX:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_012dXXXX:
+; AVX: # BB#0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3]
+; AVX-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_XXXXcde3
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
-; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,2]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_XXXXcde3:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_XXXXcde3:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_XXXXcde3:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v8i16_XXXXcde3:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i16_XXXXcde3:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,8,9,14,15]
+; AVX2-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 3>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_cde3XXXX
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,2,1]
-; CHECK-SSE2-NEXT: punpckhwd %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,2,2,3]
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: shuffle_v8i16_cde3XXXX:
+; SSE2: # BB#0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_cde3XXXX:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_cde3XXXX:
+; SSE41: # BB#0:
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15]
+; AVX2-NEXT: retq
 %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ret <8 x i16> %shuffle
 }
 
 define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-SSE2-LABEL: @shuffle_v8i16_012dcde3
-; CHECK-SSE2: # BB#0:
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm2 = xmm0[0,1,2,1]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm3 = xmm1[2,1,2,3]
-; CHECK-SSE2-NEXT: punpckhwd %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,7,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpcklwd %xmm3, %xmm0
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[3,1,2,0]
-; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[2,1,2,3]
-; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,2,0,3,4,5,6,7]
-; CHECK-SSE2-NEXT: punpcklqdq %xmm1, %xmm0
-; CHECK-SSE2-NEXT: retq
+; SSE2-LABEL: 
shuffle_v8i16_012dcde3: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_012dcde3: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_012dcde3: +; SSE41: # BB#0: +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX1-LABEL: shuffle_v8i16_012dcde3: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i16_012dcde3: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm2 +; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,14,15,8,9,14,15,12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,8,9,0,1,0,1,2,3] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 12, i32 13, i32 14, i32 3> ret <8 x i16> %shuffle } + +define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_XXX1X579: +; SSE2: # BB#0: +; SSE2-NEXT: pshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_XXX1X579: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_XXX1X579: +; SSE41: # BB#0: +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15] +; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_XXX1X579: +; AVX: # BB#0: +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,8,9,8,9,12,13,6,7] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_XX4X8acX: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_XX4X8acX: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_XX4X8acX: +; SSE41: # BB#0: +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_XX4X8acX: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: 
vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,4,5,8,9,0,1] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 10, i32 12, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) { +; SSE-LABEL: shuffle_v8i16_8zzzzzzz: +; SSE: # BB#0: +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_8zzzzzzz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) { +; SSE-LABEL: shuffle_v8i16_z8zzzzzz: +; SSE: # BB#0: +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_z8zzzzzz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 2, i32 8, i32 3, i32 7, i32 6, i32 5, i32 4, i32 3> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) { +; SSE-LABEL: shuffle_v8i16_zzzzz8zz: +; SSE: # BB#0: +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_zzzzz8zz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) { +; SSE-LABEL: shuffle_v8i16_zuuzuuz8: +; SSE: # BB#0: +; SSE-NEXT: movd %edi, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_zuuzuuz8: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 0 + %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 8> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) { +; SSE-LABEL: shuffle_v8i16_zzBzzzzz: +; SSE: # BB#0: +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_zzBzzzzz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX-NEXT: retq + %a = insertelement <8 x i16> undef, i16 %i, i32 3 + %shuffle = shufflevector <8 x i16> 
zeroinitializer, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 11, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_def01234(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_def01234: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_def01234: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_def01234: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_def01234: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_ueuu123u(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_ueuu123u: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_ueuu123u: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_ueuu123u: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_ueuu123u: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_56701234(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_56701234: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_56701234: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_56701234: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_56701234: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_u6uu123u(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_u6uu123u: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; 
SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_u6uu123u: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_u6uu123u: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_u6uu123u: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_uuuu123u(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: shuffle_v8i16_uuuu123u: +; SSE: # BB#0: +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_uuuu123u: +; AVX: # BB#0: +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_bcdef012(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_bcdef012: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_bcdef012: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_bcdef012: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_bcdef012: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_ucdeuu1u(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_ucdeuu1u: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_ucdeuu1u: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_ucdeuu1u: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_ucdeuu1u: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 1, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_34567012(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: 
shuffle_v8i16_34567012: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_34567012: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_34567012: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_34567012: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_u456uu1u(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_u456uu1u: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_u456uu1u: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_u456uu1u: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_u456uu1u: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 1, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_u456uuuu(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: shuffle_v8i16_u456uuuu: +; SSE: # BB#0: +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_u456uuuu: +; AVX: # BB#0: +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_3456789a(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_3456789a: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_3456789a: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_3456789a: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_3456789a: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = 
xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_u456uu9u(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_u456uu9u: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_u456uu9u: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_u456uu9u: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_u456uu9u: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 9, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_56789abc(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_56789abc: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_56789abc: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_56789abc: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_56789abc: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_u6uu9abu(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: shuffle_v8i16_u6uu9abu: +; SSE2: # BB#0: +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_u6uu9abu: +; SSSE3: # BB#0: +; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_u6uu9abu: +; SSE41: # BB#0: +; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_u6uu9abu: +; AVX: # BB#0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9] +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef> + ret 
<8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_0uuu1uuu(<8 x i16> %a) { +; SSE2-LABEL: shuffle_v8i16_0uuu1uuu: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_0uuu1uuu: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_0uuu1uuu: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxwq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_0uuu1uuu: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxwq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_0zzz1zzz(<8 x i16> %a) { +; SSE2-LABEL: shuffle_v8i16_0zzz1zzz: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_0zzz1zzz: +; SSSE3: # BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_0zzz1zzz: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxwq %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_0zzz1zzz: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxwq %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_0u1u2u3u(<8 x i16> %a) { +; SSE2-LABEL: shuffle_v8i16_0u1u2u3u: +; SSE2: # BB#0: +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_0u1u2u3u: +; SSSE3: # BB#0: +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_0u1u2u3u: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxwd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_0u1u2u3u: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxwd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) { +; SSE2-LABEL: shuffle_v8i16_0z1z2z3z: +; SSE2: # BB#0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: shuffle_v8i16_0z1z2z3z: +; SSSE3: # BB#0: +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: shuffle_v8i16_0z1z2z3z: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxwd %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: shuffle_v8i16_0z1z2z3z: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxwd %xmm0, %xmm0 +; AVX-NEXT: retq + %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15> + ret <8 x i16> %shuffle +} diff --git 
a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll new file mode 100644 index 000000000000..7c38149a700c --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -0,0 +1,1364 @@ +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX1 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 + +target triple = "x86_64-unknown-unknown" + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,4] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: 
shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,2,3,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 
9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,0,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,4,5,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,6,7,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7] +; AVX1-NEXT: vpshuflw 
{{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,10,11,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,0,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: 
vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; AVX1: # BB#0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15: +; AVX1: # BB#0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x i16> %a, <16 x i16> %b) { +; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: +; AVX1: # BB#0: +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29] +; AVX2-NEXT: retq + %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14> + ret <16 x i16> %shuffle +} + +define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x i16> %a, <16 x i16> %b) { +; 
+; AVX1-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,7,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,0,1,u,u,0,1,u,u,0,1,u,u,16,17,u,u,16,17,u,u,16,17,u,u,16,17]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 4, i32 5, i32 6, i32 7, i32 24, i32 24, i32 24, i32 24, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,14,15,12,13,10,11,8,9,u,u,u,u,u,u,u,u,30,31,28,29,26,27,24,25]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 7, i32 6, i32 5, i32 4, i32 27, i32 26, i32 25, i32 24, i32 15, i32 14, i32 13, i32 12>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 3, i32 2, i32 1, i32 0, i32 27, i32 26, i32 25, i32 24, i32 11, i32 10, i32 9, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 10, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 13, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 14, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 9, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 10, i32 8, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 13, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 14, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,8,9,8,9,4,5,4,5,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 14, i32 14, i32 12, i32 12, i32 10, i32 10, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 14, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 15>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,2,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,12,13,8,9,4,5,4,5,0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 2, i32 4, i32 4, i32 undef, i32 6, i32 14, i32 14, i32 undef, i32 12, i32 10, i32 10, i32 8, i32 8>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,3,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 12, i32 12>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
+; AVX2: # BB#0:
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
+ ret <16 x i16> %shuffle
+}
+
+define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i16> %a, <16 x i16> %b) {
+; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ ret <16 x i16> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
new file mode 100644
index 000000000000..c7f4c3512fba
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -0,0 +1,1656 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown"
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %shuffle
+}
+
+define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
+; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
%ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, 
%ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, 
%ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: movl $15, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: movl $15, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb 
{{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],zero +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,1,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2],zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,2,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,3,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4],zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,4,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,5,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[6],zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,6,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: 
vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[7],zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,7,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,xmm2[8],zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,8,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, 
i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[9],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,9,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[10],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,10,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: 
shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,11,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,12,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb 
{{.*#+}} xmm2 = zero,zero,xmm2[13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,13,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[14],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: movl $128, %eax +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, 
%xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-NEXT: movl $15, %eax +; AVX2-NEXT: vmovd %eax, %xmm2 +; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vinserti128 $0, %xmm2, %ymm3, %ymm2 +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15,23,23,23,23,23,23,23,23,31,31,31,31,31,31,31,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28> + ret <32 x i8> %shuffle +} + +define <32 x i8> 
@shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15,19,19,19,19,23,23,23,23,27,27,27,27,31,31,31,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15, i32 19, i32 19, i32 19, i32 19, i32 23, i32 23, i32 23, i32 23, i32 27, i32 27, i32 27, i32 27, i32 31, i32 31, i32 31, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15,17,17,19,19,21,21,23,23,25,25,27,27,29,29,31,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15, i32 17, i32 17, i32 19, i32 19, i32 21, i32 21, i32 23, i32 23, i32 25, i32 25, 
i32 27, i32 27, i32 29, i32 29, i32 31, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX1: # 
BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX1: # BB#0: +; AVX1-NEXT: movl $15, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2: # BB#0: +; AVX2-NEXT: movl $15, %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm5, 
%xmm4, %xmm4 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 33, i32 2, i32 35, i32 4, i32 37, i32 6, i32 39, i32 8, i32 41, i32 10, i32 43, i32 12, i32 45, i32 14, i32 47, i32 16, i32 49, i32 18, i32 51, i32 20, i32 53, i32 22, i32 55, i32 24, i32 57, i32 26, i32 59, i32 28, i32 61, i32 30, i32 63> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32: +; AVX1: # BB#0: +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: 
vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: 
shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,8,9,10,11,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,15,14,13,12,11,10,9,8] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u,31,30,29,28,27,26,25,24] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,7,6,5,4,3,2,1,0] +; 
AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,18,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 
16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 18, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,30,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 
30, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: movl $15, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 31, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: +; AVX2: # BB#0: +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-NEXT: vpunpckhbw 
{{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: +; AVX2: # BB#0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; 
AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,17,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 17, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,18,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 18, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> 
%b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,30,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> 
@shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31: +; AVX1: # BB#0: +; AVX1-NEXT: movl $15, %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,31] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 31> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,28,28,28,28,24,24,24,24,20,20,20,20,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 28, i32 28, i32 28, i32 28, i32 24, i32 24, i32 24, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, 
i32 24, i32 24, i32 24> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,u,u,u,u,u,0,0,0,0,0,14,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,u,u,u,u,u,16,16,16,16,16,30,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,14,1,1,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,u,0,u,u,u,u,0,0,0,0,0,0,14,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,14,u,u,0,0,0,0,0,0,0,0,0,0,0,0,16,16,u,16,u,u,u,u,16,16,16,16,16,16,30,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 undef, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,u,u,u,4,u,8,8,8,8,u,u,12,u,28,28,28,28,u,u,u,24,20,20,20,20,16,16,16,16] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 undef, i32 28, i32 28, i32 
28, i32 28, i32 undef, i32 undef, i32 undef, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24: +; AVX1: # BB#0: +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,8,8,9,9,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,u,u,u,u,u,u,u,u,16,16,16,u,u,u,u,u,u,u,24,24,24,24,24,24] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[u],zero,xmm0[u,u,u,u,u,u,u,7,u,u,u,u] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,3,u,3,u,u,u,u,u,u,u],zero,xmm3[u,u,u,u] +; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1],zero,xmm2[3],zero,zero,zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,4,u,1,6],zero,zero,xmm4[0],zero,xmm4[11,u],zero,zero,zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7] +; AVX1-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[2],zero,xmm5[4,5,6,7,8,9,10],zero,xmm5[12,13,14,15] +; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[u,u,u,u,1,6,13,u,u],zero,xmm3[u,u] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,12,13,u,u,u,u],zero,zero,zero,xmm0[u,u,12,u,u] +; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,3],zero,zero,zero,zero,xmm0[8,9,10],zero,zero,xmm0[13],zero,zero +; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm4[u,u],zero,zero,xmm4[12],zero,xmm4[u,u,u],zero,zero,xmm4[u,0,3] +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,13,u,u,3,3],zero,xmm1[8,u,u,u,12,1,u],zero,zero +; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[4,5,6,7],zero,zero,zero,xmm1[11,12],zero,xmm1[14,15] +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: 
+; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23] +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,255,255,0,255,u,u,u,255,255,u,0,0,u,u,255,u,255,255,0,0,255,0,255,u,0,0,0,0> +; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255] +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 42, i32 45, i32 12, i32 13, i32 35, i32 35, i32 60, i32 40, i32 17, i32 22, i32 29, i32 44, i32 33, i32 12, i32 48, i32 51, i32 20, i32 19, i32 52, i32 19, i32 49, i32 54, i32 37, i32 32, i32 48, i32 42, i32 59, i32 7, i32 36, i32 34, i32 36, i32 39> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX2: # BB#0: +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq + %shuffle = 
shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX2: # BB#0: +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56> + ret <32 x i8> %shuffle +} + +define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47(<32 x i8> %a, <32 x i8> %b) { +; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: +; AVX1: # BB#0: +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: +; AVX2: # BB#0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47> + ret <32 x i8> %shuffle +} diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll new file mode 100644 index 000000000000..bca7fb7a276d --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -0,0 +1,888 @@ +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +target triple = "x86_64-unknown-unknown" + +define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_0000: +; AVX1: # BB#0: +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0000: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_0001: +; AVX1: # BB#0: +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0001: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_0020: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0020: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_0300: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2] +; 
AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0300: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_1000: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_1000: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_2200: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_2200: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_3330: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_3330: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_3210: +; AVX1: # BB#0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_3210: +; AVX2: # BB#0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_0023: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 3> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_0022: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_1032: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 3, i32 2> + ret <4 x double> %shuffle +} + 
+define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_1133: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_1023: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 3> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_1022: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) { +; AVX1-LABEL: shuffle_v4f64_0423: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2] +; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4f64_0423: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] +; AVX2-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 3> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_0462: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2] +; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 6, i32 2> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_0426: +; ALL: # BB#0: +; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_1537(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_1537: +; ALL: # BB#0: +; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_4062(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_4062: +; ALL: # BB#0: +; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_5173(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_5173: +; ALL: # BB#0: +; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] +; ALL-NEXT: retq + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 7, i32 3> + ret <4 x double> %shuffle +} + +define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) { +; ALL-LABEL: shuffle_v4f64_5163: +; ALL: # BB#0: +; 
+; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0527:
+; ALL: # BB#0:
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_4163:
+; ALL: # BB#0:
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0145:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_4501:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_0167:
+; ALL: # BB#0:
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1054:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_3254:
+; ALL: # BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_3276:
+; ALL: # BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) {
+; ALL-LABEL: shuffle_v4f64_1076:
+; ALL: # BB#0:
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_0415(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0415:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4f64_0415:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x double> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0001:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0001:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0020:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0020:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0112:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0112:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0300:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0300:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_1000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_2200:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_2200:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_3330:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_3330:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_3210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_3210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0124:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0124:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0142:
+; AVX1: # BB#0:
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0142:
+; AVX2: # BB#0:
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0412:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0412:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_4012:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_4012:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,2]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) {
+; ALL-LABEL: shuffle_v4i64_0145:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0451:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0451:
+; AVX2: # BB#0:
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) {
+; ALL-LABEL: shuffle_v4i64_4501:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_4015:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_4015:
+; AVX2: # BB#0:
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_2u35:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_2u35:
+; AVX2: # BB#0:
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1251:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[0],ymm0[2],ymm2[3]
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_1251:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,2,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1054:
+; AVX1: # BB#0:
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_1054:
+; AVX2: # BB#0:
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_3254:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_3254:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 5, i32 4>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_3276:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_3276:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 7, i32 6>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1076:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_1076:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0415:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0415:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: stress_test1:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm0[1,0,3,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
+; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: stress_test1:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm1[3,1,1,0]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,1,3]
+; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; AVX2-NEXT: retq
+  %c = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 0>
+  %d = shufflevector <4 x i64> %c, <4 x i64> undef, <4 x i32> <i32 3, i32 undef, i32 2, i32 undef>
+  %e = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 undef>
+  %f = shufflevector <4 x i64> %d, <4 x i64> %e, <4 x i32> <i32 5, i32 1, i32 1, i32 0>
+
+  ret <4 x i64> %f
+}
+
+define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
+; AVX1-LABEL: insert_reg_and_zero_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_reg_and_zero_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT: retq
+  %v = insertelement <4 x i64> undef, i64 %a, i64 0
+  %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
+; AVX1-LABEL: insert_mem_and_zero_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovq (%rdi), %xmm0
+; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_mem_and_zero_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovq (%rdi), %xmm0
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX2-NEXT: retq
+  %a = load i64* %ptr
+  %v = insertelement <4 x i64> undef, i64 %a, i64 0
+  %shuffle = shufflevector <4 x i64> %v, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
+; ALL-LABEL: insert_reg_and_zero_v4f64:
+; ALL: # BB#0:
+; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; ALL-NEXT: retq
+  %v = insertelement <4 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
+; ALL-LABEL: insert_mem_and_zero_v4f64:
+; ALL: # BB#0:
+; ALL-NEXT: vmovsd (%rdi), %xmm0
+; ALL-NEXT: retq
+  %a = load double* %ptr
+  %v = insertelement <4 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @splat_mem_v4f64(double* %ptr) {
+; ALL-LABEL: splat_mem_v4f64:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
+; ALL-NEXT: retq
+  %a = load double* %ptr
+  %v = insertelement <4 x double> undef, double %a, i32 0
+  %shuffle = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x double> %shuffle
+}
+
+define <4 x i64> @splat_mem_v4i64(i64* %ptr) {
+; AVX1-LABEL: splat_mem_v4i64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vmovddup (%rdi), %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat_mem_v4i64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX2-NEXT: retq
+  %a = load i64* %ptr
+  %v = insertelement <4 x i64> undef, i64 %a, i64 0
+  %shuffle = shufflevector <4 x i64> %v, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  ret <4 x i64> %shuffle
+}
+
+define <4 x double> @splat_mem_v4f64_2(double* %p) {
+; ALL-LABEL: splat_mem_v4f64_2:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
+; ALL-NEXT: retq
+  %1 = load double* %p
+  %2 = insertelement <2 x double> undef, double %1, i32 0
+  %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %3
+}
+
+define <4 x double> @splat_v4f64(<2 x double> %r) {
+; AVX1-LABEL: splat_v4f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat_v4f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: retq
+  %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer
+  ret <4 x double> %1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
new file mode 100644
index 000000000000..77903da35583
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -0,0 +1,1851 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+
+target triple = "x86_64-unknown-unknown"
+
+define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00000010:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00000010:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00000200:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00000200:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00003000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00003000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00040000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00040000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00500000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,1,u,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00500000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_06000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,u,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,0,4,5,4,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_06000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_70000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,u,u,u,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_70000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: movl $7, %eax
+; AVX2-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vinserti128 $0, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_01014545:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00112233:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00112233:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_00001111:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_00001111:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_81a3c5e7(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_81a3c5e7:
+; ALL: # BB#0:
+; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_08080808:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_08080808:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastss %xmm1, %ymm1
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_08084c4c:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_8823cc67:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_9832dc76:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_9810dc54(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_9810dc54:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08194c5d(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_08194c5d:
+; ALL: # BB#0:
+; ALL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_2a3b6e7f(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_2a3b6e7f:
+; ALL: # BB#0:
+; ALL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_08192a3b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_08192a3b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3>
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_08991abb:
+; AVX1: # BB#0:
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_08991abb:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_091b2d3f:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_091b2d3f:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_09ab1def:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_09ab1def:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00014445(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00014445:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00204464(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00204464:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_03004744(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_03004744:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10005444(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10005444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_22006644(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_22006644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_33307774(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_33307774:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_32107654(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_32107654:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00234467(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00234467:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00224466(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00224466:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10325476(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10325476:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_11335577(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_11335577:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10235467(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10235467:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10225466:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00015444(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00015444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00204644(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00204644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_03004474(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_03004474:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10004444(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10004444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_22006446(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_22006446:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_33307474(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_33307474:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_32104567(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_32104567:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00236744(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00236744:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00226644(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00226644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_10324567(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_10324567:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_11334567(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_11334567:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_01235467(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_01235467:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_01235466(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_01235466:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_002u6u44(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_002u6u44:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_00uu66uu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_00uu66uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_103245uu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_103245uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_1133uu67(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_1133uu67:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_0uu354uu(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_0uu354uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_uuu3uu66(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_uuu3uu66:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_c348cda0:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[0,0],ymm0[4,7],ymm2[4,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_c348cda0:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,3,4,u,u,u,u,0>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_f511235a:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[0,0,3,2]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,1,1,4,5,5,5]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_f511235a:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <7,u,u,u,u,u,u,2>
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,5,1,1,2,3,5,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_32103210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_32103210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_76547654:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_76547654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8f32_76543210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_76543210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_3210ba98:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_3210fedc:
+; ALL: # BB#0:
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_7654fedc:
+; ALL: # BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_fedc7654:
+; ALL: # BB#0:
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_ba987654:
+; ALL: # BB#0:
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
+; ALL-LABEL: shuffle_v8f32_ba983210:
+; ALL: # BB#0:
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+  ret <8 x float> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00000010:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00000010:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00000200:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00000200:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00003000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00003000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00040000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,3,4,4,4,7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00040000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00500000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,1,u,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00500000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_06000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,u,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,0,4,5,4,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_06000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_70000000:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,u,u,u,4,4,4,4]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_70000000:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: movl $7, %eax
+; AVX2-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vinserti128 $0, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_01014545:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_01014545:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00112233:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00112233:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00001111:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00001111:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_81a3c5e7(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_81a3c5e7:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_81a3c5e7:
shuffle_v8i32_81a3c5e7: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_08080808: +; AVX1: # BB#0: +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,2,0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_08080808: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_08084c4c: +; AVX1: # BB#0: +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_08084c4c: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,0,4,4,6,4] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_8823cc67(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_8823cc67: +; AVX1: # BB#0: +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_8823cc67: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_9832dc76(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_9832dc76: +; AVX1: # BB#0: +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_9832dc76: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,3,5,4,6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT: retq + %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6> + ret <8 x i32> %shuffle +} + +define <8 x i32> @shuffle_v8i32_9810dc54(<8 x i32> %a, <8 x i32> %b) { +; AVX1-LABEL: shuffle_v8i32_9810dc54: +; AVX1: # BB#0: +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v8i32_9810dc54: +; AVX2: # BB#0: +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,0,4,5,5,4] +; AVX2-NEXT: vpshufd {{.*#+}} 
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,3,5,4,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_08194c5d(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08194c5d:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_08194c5d:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_2a3b6e7f(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_2a3b6e7f:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_2a3b6e7f:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08192a3b:
+; AVX1: # BB#0:
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_08192a3b:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3>
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_08991abb:
+; AVX1: # BB#0:
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,0],xmm1[2,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_08991abb:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_091b2d3f:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_091b2d3f:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_09ab1def:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_09ab1def:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00014445(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00014445:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00014445:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00204464(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00204464:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00204464:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_03004744(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_03004744:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_03004744:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10005444(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10005444:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_10005444:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_22006644(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_22006644:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_22006644:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_33307774(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_33307774:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_33307774:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_32107654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_32107654:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_32107654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00234467(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00234467:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00234467:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00224466(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00224466:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00224466:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10325476(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10325476:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_10325476:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_11335577(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_11335577:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_11335577:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10235467(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10235467:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_10235467:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10225466(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10225466:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_10225466:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00015444:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00015444:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00204644:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00204644:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_03004474:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_03004474:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10004444:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_10004444:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_22006446:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_22006446:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_33307474:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_33307474:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_32104567:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_32104567:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00236744:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00236744:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00226644:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00226644:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_10324567:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_10324567:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_11334567:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_11334567:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_01235467:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_01235467:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_01235466:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_01235466:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_002u6u44:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_002u6u44:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4>
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_00uu66uu:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_00uu66uu:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_103245uu:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_103245uu:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_1133uu67:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_1133uu67:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7>
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_0uu354uu:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_0uu354uu:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u>
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_uuu3uu66:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_uuu3uu66:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6>
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_6caa87e5:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[2,2],ymm2[4,4],ymm1[6,6]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_6caa87e5:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,4,2,2,0,u,6,u>
+; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,3,2]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_32103210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_32103210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_76547654:
+; AVX1: # BB#0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_76547654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_76543210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_76543210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_3210ba98:
+; AVX1: # BB#0:
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_3210ba98:
+; AVX2: # BB#0:
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_3210fedc:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_3210fedc:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_7654fedc:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_7654fedc:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_fedc7654:
+; AVX1: # BB#0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_fedc7654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_ba987654:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_ba987654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
+; AVX1-LABEL: shuffle_v8i32_ba983210:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8i32_ba983210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i32> %shuffle
+}
+
+define <8 x float> @splat_mem_v8f32_2(float* %p) {
+; ALL-LABEL: splat_mem_v8f32_2:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastss (%rdi), %ymm0
+; ALL-NEXT: retq
+ %1 = load float* %p
+ %2 = insertelement <4 x float> undef, float %1, i32 0
+ %3 = shufflevector <4 x float> %2, <4 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %3
+}
+
+define <8 x float> @splat_v8f32(<4 x float> %r) {
+; AVX1-LABEL: splat_v8f32:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: splat_v8f32:
+; AVX2: # BB#0:
+; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT: retq
+ %1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
new file mode 100644
index 000000000000..713ef7835cde
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+
+target triple = "x86_64-unknown-unknown"
+
+define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d:
+; ALL: # BB#0:
+; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ ret <16 x float> %shuffle
+}
+
+define <16 x i32> @shuffle_v16i32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d:
+; ALL: # BB#0:
+; ALL-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ ret <16 x i32> %shuffle
+}
+
+define <16 x float> @shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x float> %a, <16 x float> %b) {
+; ALL-LABEL: shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f:
+; ALL: # BB#0:
+; ALL-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x float> %shuffle
+}
+
+define <16 x i32> @shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f:
+; ALL: # BB#0:
+; ALL-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i32> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
new file mode 100644
index 000000000000..fd4c9cb72bad
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -0,0 +1,1456 @@
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+
+target triple = "x86_64-unknown-unknown"
+
+define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00000000:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00000010:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00000200:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00003000:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00040000:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00500000:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_06000000:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_70000000:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_01014545:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00112233:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00001111:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_81a3c5e7:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08080808:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08084c4c:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_8823cc67:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_9832dc76:
+; ALL: # BB#0:
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_9810dc54:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
+; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08194c5d:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_2a3b6e7f:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08192a3b:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_08991abb:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm0[1,0,2,2]
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,2,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_091b2d3f:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_09ab1def:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00014445:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00204464:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_03004744:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10005444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_22006644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_33307774:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_32107654:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00234467:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00224466:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10325476:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_11335577:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10235467:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10225466:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00015444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00204644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_03004474:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10004444:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_22006446:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_33307474:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_32104567:
+; ALL: # BB#0:
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00236744:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00226644:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_10324567:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_11334567:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_01235467:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_01235466:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_002u6u44:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_00uu66uu:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) {
+; ALL-LABEL: shuffle_v8f64_103245uu:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
$1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_1133uu67: +; ALL: # BB#0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_0uu354uu: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_uuu3uu66: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,2,2] +; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_c348cda0: +; ALL: # BB#0: +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[0,1],ymm2[0,1] +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vbroadcastsd %xmm1, %ymm4 +; ALL-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3] +; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3] +; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] +; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0> + ret <8 x double> %shuffle +} + +define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_f511235a: +; ALL: # BB#0: +; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[0,1,1,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3] +; ALL-NEXT: vpermilpd {{.*#+}} ymm4 = ymm1[0,0,2,2] +; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3] +; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3] +; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3] +; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] +; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10> + ret <8 x double> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00000000: +; ALL: # 
BB#0: +; ALL-NEXT: vpbroadcastq %xmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00000010: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00000200: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00003000: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00040000: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00500000: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_06000000: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_70000000: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,2,3] +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + 
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_01014545: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00112233: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00001111: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_81a3c5e7: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_08080808: +; ALL: # BB#0: +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_08084c4c: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; ALL-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpbroadcastq %xmm3, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_8823cc67: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, 
%zmm0, %ymm2 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpbroadcastq %xmm3, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_9832dc76: +; ALL: # BB#0: +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_9810dc54: +; ALL: # BB#0: +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2 +; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_08194c5d: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_2a3b6e7f: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_08192a3b: +; ALL: # BB#0: +; ALL-NEXT: 
vpermq {{.*#+}} ymm2 = ymm1[0,2,2,3] +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_08991abb: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_091b2d3f: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_09ab1def: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00014445: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00204464: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_03004744: +; ALL: # BB#0: +; 
ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_10005444: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_22006644: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_33307774: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_32107654: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00234467: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00224466: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_10325476: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; 
ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_11335577: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_10235467: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,3] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_10225466: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,2] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,2] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00015444: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00204644: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_03004474: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,3,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_10004444: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; 
ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_22006446: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,0,2] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_33307474: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,3,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_32104567: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00236744: +; ALL: # BB#0: +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00226644: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_10324567: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_11334567: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) { +; 
ALL-LABEL: shuffle_v8i64_01235467: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,3] +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_01235466: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,2] +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_002u6u44: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,0,0] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_00uu66uu: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_103245uu: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_1133uu67: +; ALL: # BB#0: +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_0uu354uu: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_uuu3uu66: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] +; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 
undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6> + ret <8 x i64> %shuffle +} + +define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_6caa87e5: +; ALL: # BB#0: +; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; ALL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1,0,1] +; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; ALL-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7] +; ALL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] +; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5> + ret <8 x i64> %shuffle +} + +define <8 x double> @shuffle_v8f64_082a4c6e(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_082a4c6e: +; ALL: # BB#0: +; ALL-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32><i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + ret <8 x double> %shuffle +} + +define <8 x i64> @shuffle_v8i64_082a4c6e(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_082a4c6e: +; ALL: # BB#0: +; ALL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32><i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> + ret <8 x i64> %shuffle +} + +define <8 x double> @shuffle_v8f64_193b5d7f(<8 x double> %a, <8 x double> %b) { +; ALL-LABEL: shuffle_v8f64_193b5d7f: +; ALL: # BB#0: +; ALL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + ret <8 x double> %shuffle +} + +define <8 x i64> @shuffle_v8i64_193b5d7f(<8 x i64> %a, <8 x i64> %b) { +; ALL-LABEL: shuffle_v8i64_193b5d7f: +; ALL: # BB#0: +; ALL-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; ALL-NEXT: retq + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + ret <8 x i64> %shuffle +} diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll index e60ecb70dec6..4e2bf87fdf64 100644 --- a/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1,6 +1,14 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2 +; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | 
FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; +; Verify that the DAG combiner correctly folds bitwise operations across +; shuffles, nested shuffles with undef, pairs of nested shuffles, and other +; basic and always-safe patterns. Also test that the DAG combiner will combine +; target-specific shuffle instructions where reasonable. -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) @@ -8,57 +16,72 @@ declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) define <4 x i32> @combine_pshufd1(<4 x i32> %a) { -; CHECK-SSE2-LABEL: @combine_pshufd1 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: retq - %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) - %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) +; ALL-LABEL: combine_pshufd1: +; ALL: # BB#0: # %entry +; ALL-NEXT: retq +entry: + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) + %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) ret <4 x i32> %c } define <4 x i32> @combine_pshufd2(<4 x i32> %a) { -; CHECK-SSE2-LABEL: @combine_pshufd2 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: retq - %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) +; ALL-LABEL: combine_pshufd2: +; ALL: # BB#0: # %entry +; ALL-NEXT: retq +entry: + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) %b.cast = bitcast <4 x i32> %b to <8 x i16> %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28) %c.cast = bitcast <8 x i16> %c to <4 x i32> - %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) + %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) ret <4 x i32> %d } define <4 x i32> @combine_pshufd3(<4 x i32> %a) { -; CHECK-SSE2-LABEL: @combine_pshufd3 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: retq - %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) +; ALL-LABEL: combine_pshufd3: +; ALL: # BB#0: # %entry +; ALL-NEXT: retq +entry: + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) %b.cast = bitcast <4 x i32> %b to <8 x i16> %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28) %c.cast = bitcast <8 x i16> %c to <4 x i32> - %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) + %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) ret <4 x i32> %d } define <4 x i32> @combine_pshufd4(<4 x i32> %a) { -; CHECK-SSE2-LABEL: @combine_pshufd4 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: retq - %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) +; SSE-LABEL: combine_pshufd4: +; SSE: # BB#0: # %entry +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_pshufd4: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; AVX-NEXT: retq +entry: + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) %b.cast = bitcast <4 x i32> %b to <8 x i16> %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27) %c.cast = bitcast <8 x i16> %c to <4 x i32> - %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) + %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) ret <4 x i32> %d } define <4 x i32> @combine_pshufd5(<4 x i32> %a) { -; CHECK-SSE2-LABEL: @combine_pshufd5 -; CHECK-SSE2: # BB#0: -; 
CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: retq - %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) +; SSE-LABEL: combine_pshufd5: +; SSE: # BB#0: # %entry +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_pshufd5: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; AVX-NEXT: retq +entry: + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) %b.cast = bitcast <4 x i32> %b to <8 x i16> %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27) %c.cast = bitcast <8 x i16> %c to <4 x i32> @@ -67,53 +90,2474 @@ define <4 x i32> @combine_pshufd5(<4 x i32> %a) { } define <4 x i32> @combine_pshufd6(<4 x i32> %a) { -; CHECK-SSE2-LABEL: @combine_pshufd6 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufd $0 -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: combine_pshufd6: +; SSE: # BB#0: # %entry +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_pshufd6: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: retq +entry: %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0) %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8) ret <4 x i32> %c } define <8 x i16> @combine_pshuflw1(<8 x i16> %a) { -; CHECK-SSE2-LABEL: @combine_pshuflw1 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: retq - %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) - %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) +; ALL-LABEL: combine_pshuflw1: +; ALL: # BB#0: # %entry +; ALL-NEXT: retq +entry: + %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) + %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) ret <8 x i16> %c } define <8 x i16> @combine_pshuflw2(<8 x i16> %a) { -; CHECK-SSE2-LABEL: @combine_pshuflw2 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: retq +; ALL-LABEL: combine_pshuflw2: +; ALL: # BB#0: # %entry +; ALL-NEXT: retq +entry: %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) - %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28) - %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) + %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28) + %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) ret <8 x i16> %d } define <8 x i16> @combine_pshuflw3(<8 x i16> %a) { -; CHECK-SSE2-LABEL: @combine_pshuflw3 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: combine_pshuflw3: +; SSE: # BB#0: # %entry +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_pshuflw3: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; AVX-NEXT: retq +entry: %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) - %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27) - %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) + %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27) + %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) ret <8 x i16> %d } define <8 x i16> @combine_pshufhw1(<8 x i16> %a) { -; CHECK-SSE2-LABEL: @combine_pshufhw1 -; CHECK-SSE2: # BB#0: -; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] -; CHECK-SSE2-NEXT: retq +; SSE-LABEL: combine_pshufhw1: +; SSE: # BB#0: # %entry +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; SSE-NEXT: 
retq +; +; AVX-LABEL: combine_pshufhw1: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; AVX-NEXT: retq +entry: %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) - %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) - %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27) + %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) + %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27) ret <8 x i16> %d } +define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test1: +; SSE: # BB#0: +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test1: +; AVX: # BB#0: +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} + +define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test2: +; SSE: # BB#0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test2: +; AVX: # BB#0: +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} + +define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test3: +; SSE: # BB#0: +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test3: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} + +define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test4: +; SSE: # BB#0: +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test4: +; AVX: # BB#0: +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} + +define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test5: +; SSE: # BB#0: +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test5: +; AVX: # BB#0: +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %shuf1 
= shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} + +define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test6: +; SSE: # BB#0: +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test6: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} + + +; Verify that DAGCombiner moves the shuffle after the xor/and/or even if the shuffles +; are not performing swizzle operations. + +define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE2-LABEL: combine_bitwise_ops_test1b: +; SSE2: # BB#0: +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_bitwise_ops_test1b: +; SSSE3: # BB#0: +; SSSE3-NEXT: andps %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_bitwise_ops_test1b: +; SSE41: # BB#0: +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_bitwise_ops_test1b: +; AVX1: # BB#0: +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test1b: +; AVX2: # BB#0: +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} + +define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE2-LABEL: combine_bitwise_ops_test2b: +; SSE2: # BB#0: +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_bitwise_ops_test2b: +; SSSE3: # BB#0: +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_bitwise_ops_test2b: +; SSE41: # BB#0: +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_bitwise_ops_test2b: +; AVX1: # BB#0: +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test2b: +; AVX2: # BB#0: +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] +; AVX2-NEXT: retq + %shuf1 = shufflevector <4 x 
i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} + +define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE2-LABEL: combine_bitwise_ops_test3b: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_bitwise_ops_test3b: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm0 +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_bitwise_ops_test3b: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_bitwise_ops_test3b: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test3b: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} + +define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE2-LABEL: combine_bitwise_ops_test4b: +; SSE2: # BB#0: +; SSE2-NEXT: andps %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_bitwise_ops_test4b: +; SSSE3: # BB#0: +; SSSE3-NEXT: andps %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_bitwise_ops_test4b: +; SSE41: # BB#0: +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_bitwise_ops_test4b: +; AVX1: # BB#0: +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test4b: +; AVX2: # BB#0: +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX2-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} + +define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE2-LABEL: combine_bitwise_ops_test5b: +; SSE2: # BB#0: +; SSE2-NEXT: orps %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: 
retq +; +; SSSE3-LABEL: combine_bitwise_ops_test5b: +; SSSE3: # BB#0: +; SSSE3-NEXT: orps %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_bitwise_ops_test5b: +; SSE41: # BB#0: +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_bitwise_ops_test5b: +; AVX1: # BB#0: +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test5b: +; AVX2: # BB#0: +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; AVX2-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} + +define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE2-LABEL: combine_bitwise_ops_test6b: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_bitwise_ops_test6b: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm0 +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_bitwise_ops_test6b: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_bitwise_ops_test6b: +; AVX1: # BB#0: +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_bitwise_ops_test6b: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} + +define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test1c: +; SSE: # BB#0: +; SSE-NEXT: andps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test1c: +; AVX: # BB#0: +; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} + +define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test2c: +; SSE: # BB#0: +; 
SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test2c: +; AVX: # BB#0: +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} + +define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE2-LABEL: combine_bitwise_ops_test3c: +; SSE2: # BB#0: +; SSE2-NEXT: xorps %xmm1, %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_bitwise_ops_test3c: +; SSSE3: # BB#0: +; SSSE3-NEXT: xorps %xmm1, %xmm0 +; SSSE3-NEXT: xorps %xmm1, %xmm1 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_bitwise_ops_test3c: +; SSE41: # BB#0: +; SSE41-NEXT: xorps %xmm1, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test3c: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} + +define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test4c: +; SSE: # BB#0: +; SSE-NEXT: andps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test4c: +; AVX: # BB#0: +; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %and = and <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %and +} + +define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test5c: +; SSE: # BB#0: +; SSE-NEXT: orps %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE-NEXT: movaps %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test5c: +; AVX: # BB#0: +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %or = or <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %or +} + +define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; SSE-LABEL: combine_bitwise_ops_test6c: +; SSE: # BB#0: +; SSE-NEXT: xorps %xmm1, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_bitwise_ops_test6c: +; AVX: # BB#0: +; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = 
xmm1[0,2],xmm0[1,3] +; AVX-NEXT: retq + %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> + %xor = xor <4 x i32> %shuf1, %shuf2 + ret <4 x i32> %xor +} + +define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test1: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test1: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test2: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test2: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test3: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test3: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test4: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_nested_undef_test4: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test4: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test5: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test5: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test6: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test6: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> + %2 = shufflevector <4 x i32> %1, <4 x i32> 
undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test7: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test7: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test8: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test8: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test9: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test9: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test10: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test10: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test11: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test11: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test12: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_nested_undef_test12: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test12: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4> + ret <4 x i32> %2 +} + +; The following pair of shuffles is folded into vector %A. 
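To see why this fold is sound, here is a minimal sketch of the mask composition involved: composing combine_nested_undef_test13's inner mask <1, 4, 2, 6> with its outer mask <4, 0, 2, 4> leaves every defined lane reading %A at its own position, so the pair can be replaced by %A outright once the undef lanes are chosen to match. The C++ below is illustrative only; the helper is not an LLVM API.

#include <array>
#include <cstdio>

// Compose outer over inner: out[i] says which element of the inner shuffle's
// original inputs feeds lane i of the final vector (-1 marks an undef lane).
std::array<int, 4> compose(std::array<int, 4> inner, std::array<int, 4> outer) {
  std::array<int, 4> out{};
  for (int i = 0; i < 4; ++i)
    // The outer shuffle's second operand is undef, so indices >= 4 are undef.
    out[i] = (outer[i] < 0 || outer[i] >= 4) ? -1 : inner[outer[i]];
  return out;
}

int main() {
  auto m = compose({1, 4, 2, 6}, {4, 0, 2, 4});
  for (int i : m) std::printf("%d ", i); // -1 1 2 -1: lanes 1 and 2 read
}                                        // %A[1] and %A[2], the rest is undef.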
+define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
+; ALL-LABEL: combine_nested_undef_test13:
+; ALL: # BB#0:
+; ALL-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
+ ret <4 x i32> %2
+}
+
+; The following pair of shuffles is folded into vector %B.
+define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test14:
+; SSE: # BB#0:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test14:
+; AVX: # BB#0:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4>
+ ret <4 x i32> %2
+}
+
+
+; Verify that we don't optimize the following cases. We expect more than one shuffle.
+;
+; FIXME: Many of these already don't make sense, and the rest should stop
+; making sense with the new vector shuffle lowering. Revisit at least testing for
+; it.
+
+define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test15:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_nested_undef_test15:
+; AVX: # BB#0:
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[3,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
+; SSE2-LABEL: combine_nested_undef_test16:
+; SSE2: # BB#0:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_nested_undef_test16:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_nested_undef_test16:
+; SSE41: # BB#0:
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: combine_nested_undef_test16:
+; AVX1: # BB#0:
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_nested_undef_test16:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX2-NEXT: retq
+ %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
+; SSE-LABEL: combine_nested_undef_test17:
+; SSE: # BB#0:
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
+; SSE-NEXT: 
shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test17: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test18: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test18: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test19: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,0,0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test19: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,0,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test20: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test20: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test21: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[3,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test21: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[3,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + + +; Test that we correctly combine shuffles according to rule +; shuffle(shuffle(x, y), undef) -> shuffle(y, undef) + +define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test22: +; SSE: # BB#0: +; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm1[1,1,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test22: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test23: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test23: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test24: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test24: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test25: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_nested_undef_test25: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test25: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test26: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_nested_undef_test26: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test27: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_nested_undef_test27: +; AVX1: # BB#0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_nested_undef_test27: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) { +; SSE-LABEL: combine_nested_undef_test28: +; SSE: # BB#0: +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; SSE-NEXT: retq +; +; AVX-LABEL: 
combine_nested_undef_test28: +; AVX: # BB#0: +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2> + ret <4 x i32> %2 +} + +define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test1: +; SSE: # BB#0: +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test1: +; AVX: # BB#0: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test2: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test2: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test3: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test3: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test4: +; SSE: # BB#0: +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test4: +; AVX: # BB#0: +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test5: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test5: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test5: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = 
xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test5: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x float> %2 +} + +define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test6: +; SSE: # BB#0: +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test6: +; AVX: # BB#0: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test7: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test7: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test7: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test7: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test7: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test8: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test8: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test9: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test9: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test10: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test10: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm1, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[0,1],xmm1[2,0] +; SSSE3-NEXT: movaps %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test10: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test10: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test10: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x i32> %2 +} + +define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) { +; ALL-LABEL: combine_test11: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test12: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test12: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test12: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test12: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test13: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test13: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test14: +; SSE: # BB#0: +; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test14: +; AVX: # BB#0: +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test15: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test15: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm0, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[1,0],xmm2[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test15: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test15: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + ret <4 x float> %2 +} + +define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) { +; ALL-LABEL: combine_test16: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test17: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test17: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test17: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test17: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test17: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test18: +; SSE: # BB#0: +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test18: +; AVX: # BB#0: +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test19: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test19: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) { +; SSE2-LABEL: combine_test20: +; SSE2: # BB#0: +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test20: +; SSSE3: # BB#0: +; SSSE3-NEXT: movaps %xmm0, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] +; SSSE3-NEXT: shufps {{.*#+}} 
xmm0 = xmm0[2,0],xmm2[2,3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test20: +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test20: +; AVX1: # BB#0: +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test20: +; AVX2: # BB#0: +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> + %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) { +; SSE-LABEL: combine_test21: +; SSE: # BB#0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movdqa %xmm2, +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_test21: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX1-NEXT: movdqa %xmm2, +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test21: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; AVX2-NEXT: movdqa %xmm2, +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7> + store <4 x i32> %1, <4 x i32>* %ptr, align 16 + ret <4 x i32> %2 +} + +define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) { +; SSE-LABEL: combine_test22: +; SSE: # BB#0: +; SSE-NEXT: movq (%rdi), %xmm0 +; SSE-NEXT: movhpd (%rsi), %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_test22: +; AVX1: # BB#0: +; AVX1-NEXT: vmovq (%rdi), %xmm0 +; AVX1-NEXT: vmovhpd (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; Current AVX2 lowering of this is still awful, not adding a test case. + %1 = load <2 x float>* %a, align 8 + %2 = load <2 x float>* %b, align 8 + %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x float> %3 +} + +; Check some negative cases. +; FIXME: Do any of these really make sense? Are they redundant with the above tests? 
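One way to read the "negative" label: the pairs below still fold to a single shuffle, just not to a blend or a register move. For combine_test1b (the first test below), composing the masks shows the result reads only %b; the outer shuffle's second operand is the same %b that feeds the inner shuffle, so outer indices >= 4 pass through unchanged. An illustrative sketch under the same mask-composition model as earlier, not an LLVM API:

#include <array>
#include <cstdio>

std::array<int, 4> compose_shared_b(std::array<int, 4> inner,
                                    std::array<int, 4> outer) {
  std::array<int, 4> out{};
  for (int i = 0; i < 4; ++i)
    // Indices 0-3 select from the inner shuffle; 4-7 select %b directly.
    out[i] = outer[i] < 4 ? inner[outer[i]] : outer[i];
  return out;
}

int main() {
  auto m = compose_shared_b({4, 1, 6, 3}, {0, 5, 2, 0});
  for (int i : m) std::printf("%d ", i); // 4 5 6 4: all lanes come from %b,
}                                        // matching the shufps xmm1[0,1,2,0].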
+ +define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test1b: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test1b: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0> + ret <4 x float> %2 +} + +define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_test2b: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2b: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0] +; SSSE3-NEXT: movapd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2b: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test2b: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test3b: +; SSE: # BB#0: +; SSE-NEXT: movaps %xmm1, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test3b: +; AVX: # BB#0: +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,0],xmm0[3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_test4b: +; SSE: # BB#0: +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test4b: +; AVX: # BB#0: +; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7> + ret <4 x float> %2 +} + + +; Verify that we correctly fold shuffles even when we use illegal vector types. 
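"Illegal" here means <4 x i8> is not a native SSE register type, so the operands are legalized first. Roughly what the SSE4.1 output for combine_test1c below amounts to, sketched with intrinsics; the function names are illustrative, and this approximates the lowering rather than reproducing it:

#include <smmintrin.h>  // SSE4.1 intrinsics
#include <cstring>

static __m128i load4_zext(const unsigned char *p) {
  int bits;
  std::memcpy(&bits, p, 4);                          // load 4 packed i8 lanes
  return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(bits)); // pmovzxbd: i8 -> i32
}

__m128i combine_test1c_like(const unsigned char *a, const unsigned char *b) {
  __m128i A = load4_zext(a), B = load4_zext(b);
  // Single blend replacing both IR shuffles: i32 lane 0 from A, lanes 1-3
  // from B (imm 0x03 takes the two low 16-bit words from the second operand).
  return _mm_blend_epi16(B, A, 0x03);
}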
+ +define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: combine_test1c: +; SSE2: # BB#0: +; SSE2-NEXT: movd (%rdi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd (%rsi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movss %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test1c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd (%rsi), %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movss %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test1c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test1c: +; AVX1: # BB#0: +; AVX1-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX1-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test1c: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX2-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX2-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3> + ret <4 x i8> %2 +} + +define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: combine_test2c: +; SSE2: # BB#0: +; SSE2-NEXT: movd (%rdi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movd (%rsi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test2c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movd (%rsi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test2c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm0 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm1 +; SSE41-NEXT: punpcklqdq 
{{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test2c: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1> + ret <4 x i8> %2 +} + +define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: combine_test3c: +; SSE2: # BB#0: +; SSE2-NEXT: movd (%rdi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd (%rsi), %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test3c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd (%rsi), %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test3c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm0 +; SSE41-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_test3c: +; AVX: # BB#0: +; AVX-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x i8> %2 +} + +define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) { +; SSE2-LABEL: combine_test4c: +; SSE2: # BB#0: +; SSE2-NEXT: movd (%rdi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd (%rsi), %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_test4c: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd (%rdi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd (%rsi), %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_test4c: +; SSE41: # BB#0: +; SSE41-NEXT: pmovzxbd (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbd (%rsi), %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq +; +; AVX1-LABEL: combine_test4c: +; AVX1: # BB#0: +; AVX1-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX1-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_test4c: +; AVX2: # BB#0: +; AVX2-NEXT: vpmovzxbd (%rdi), %xmm0 +; AVX2-NEXT: vpmovzxbd (%rsi), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-NEXT: retq + %A = load <4 x i8>* %a + %B = load <4 x i8>* %b + %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> + %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x i8> %2 +} + + +; The following test cases are generated from this C++ code +; +;__m128 blend_01(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<0 ); +; s = _mm_blend_ps( s, b, 1<<1 ); +; return s; +;} +; +;__m128 blend_02(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<0 ); +; s = _mm_blend_ps( s, b, 1<<2 ); +; return s; +;} +; +;__m128 blend_123(__m128 a, __m128 b) +;{ +; __m128 s = a; +; s = _mm_blend_ps( s, b, 1<<1 ); +; s = _mm_blend_ps( s, b, 1<<2 ); +; s = _mm_blend_ps( s, b, 1<<3 ); +; return s; +;} + +; Ideally, we should collapse the following shuffles into a single one. 
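For reference, the collapsed forms the comment asks for, in the same intrinsic style as the C++ above: consecutive _mm_blend_ps calls whose second operand is the same b simply OR their masks together. A sketch, not the committed lowering:

#include <smmintrin.h>

__m128 blend_01_collapsed(__m128 a, __m128 b)  { return _mm_blend_ps(a, b, 0x3); } // (1<<0)|(1<<1)
__m128 blend_02_collapsed(__m128 a, __m128 b)  { return _mm_blend_ps(a, b, 0x5); } // (1<<0)|(1<<2)
__m128 blend_123_collapsed(__m128 a, __m128 b) { return _mm_blend_ps(a, b, 0xE); } // (1<<1)|(1<<2)|(1<<3)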
+ +define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_blend_01: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_blend_01: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_blend_01: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_blend_01: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + ret <4 x float> %shuffle6 +} + +define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_blend_02: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm1, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_blend_02: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm1, %xmm0 +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_blend_02: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_blend_02: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> + ret <4 x float> %shuffle6 +} + +define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_blend_123: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_blend_123: +; SSSE3: # BB#0: +; SSSE3-NEXT: movss %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_blend_123: +; SSE41: # BB#0: +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_blend_123: +; AVX: # BB#0: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> + %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> + %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x float> %shuffle12 +} + +define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test_movhl_1: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test_movhl_1: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test_movhl_2: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, 
%xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test_movhl_2: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2> + ret <4 x i32> %2 +} + +define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: combine_test_movhl_3: +; SSE: # BB#0: +; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_test_movhl_3: +; AVX: # BB#0: +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2> + %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2> + ret <4 x i32> %2 +} + + +; Verify that we fold shuffles according to rule: +; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2) + +define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_undef_input_test1: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test1: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test1: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test1: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_undef_input_test2: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test2: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_undef_input_test3: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test3: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_undef_input_test4: +; SSE: # BB#0: +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test4: +; AVX: # BB#0: +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> 
@combine_undef_input_test5(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_undef_input_test5: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test5: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test5: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test5: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7> + ret <4 x float> %2 +} + + +; Verify that we fold shuffles according to rule: +; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) + +define <4 x float> @combine_undef_input_test6(<4 x float> %a) { +; ALL-LABEL: combine_undef_input_test6: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test7(<4 x float> %a) { +; SSE2-LABEL: combine_undef_input_test7: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test7: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test7: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test7: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test8(<4 x float> %a) { +; SSE2-LABEL: combine_undef_input_test8: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test8: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test8: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test8: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test9(<4 x float> %a) { +; SSE-LABEL: combine_undef_input_test9: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test9: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test10(<4 x float> %a) { +; ALL-LABEL: combine_undef_input_test10: +; 
ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> + %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_undef_input_test11: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test11: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test11: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test11: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_undef_input_test12: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test12: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_undef_input_test13: +; SSE: # BB#0: +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test13: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) { +; SSE-LABEL: combine_undef_input_test14: +; SSE: # BB#0: +; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movapd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test14: +; AVX: # BB#0: +; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) { +; SSE2-LABEL: combine_undef_input_test15: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test15: +; SSSE3: # BB#0: +; SSSE3-NEXT: movsd %xmm0, %xmm1 +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test15: +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test15: +; AVX: # BB#0: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> + %2 = shufflevector <4 x float> %b, <4 x float> 
%1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> + ret <4 x float> %2 +} + + +; Verify that shuffles are canonicalized according to rules: +; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) +; +; This allows us to trigger the following combine rule: +; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) +; +; As a result, all the shuffle pairs in each function below should be +; combined into a single legal shuffle operation. + +define <4 x float> @combine_undef_input_test16(<4 x float> %a) { +; ALL-LABEL: combine_undef_input_test16: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test17(<4 x float> %a) { +; SSE2-LABEL: combine_undef_input_test17: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test17: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test17: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test17: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test18(<4 x float> %a) { +; SSE2-LABEL: combine_undef_input_test18: +; SSE2: # BB#0: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: combine_undef_input_test18: +; SSSE3: # BB#0: +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: combine_undef_input_test18: +; SSE41: # BB#0: +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test18: +; AVX: # BB#0: +; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test19(<4 x float> %a) { +; SSE-LABEL: combine_undef_input_test19: +; SSE: # BB#0: +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: retq +; +; AVX-LABEL: combine_undef_input_test19: +; AVX: # BB#0: +; AVX-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1] +; AVX-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> + ret <4 x float> %2 +} + +define <4 x float> @combine_undef_input_test20(<4 x float> %a) { +; ALL-LABEL: combine_undef_input_test20: +; ALL: # BB#0: +; ALL-NEXT: retq + %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> + %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> + ret <4 x float> %2 +} + +; These tests are designed to test the ability to combine away unnecessary +; operations feeding into a shuffle. 
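+; For example (reading the checks below), in @combine_unneeded_subvector1 the +; shuffle mask <7,6,5,4,7,6,5,4> uses only elements 4-7 of the add result, so +; the add only needs to be computed on the high 128-bit half, which is why the +; SSE checks show a single paddd on %xmm1. 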
The AVX cases are the important ones as +; they leverage operations which cannot be done naturally on the entire vector +; and thus are decomposed into multiple smaller operations. + +define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) { +; SSE-LABEL: combine_unneeded_subvector1: +; SSE: # BB#0: +; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_unneeded_subvector1: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_unneeded_subvector1: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> + %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4> + ret <8 x i32> %c +} + +define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) { +; SSE-LABEL: combine_unneeded_subvector2: +; SSE: # BB#0: +; SSE-NEXT: paddd {{.*}}(%rip), %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_unneeded_subvector2: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_unneeded_subvector2: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: retq + %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8> + %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12> + ret <8 x i32> %d +} + +define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) { +; SSE41-LABEL: combine_insertps1: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_insertps1: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3] +; AVX-NEXT: retq + + %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4> + %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3> + ret <4 x float> %d +} + +define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) { +; SSE41-LABEL: combine_insertps2: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_insertps2: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3] +; AVX-NEXT: retq + + %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7> + %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3> + ret <4 x float> %d +} + +define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) { +; SSE41-LABEL: combine_insertps3: +; SSE41: # BB#0: +; SSE41-NEXT: insertps 
{{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_insertps3: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; AVX-NEXT: retq + + %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> + %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3> + ret <4 x float> %d +} + +define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) { +; SSE41-LABEL: combine_insertps4: +; SSE41: # BB#0: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: combine_insertps4: +; AVX: # BB#0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX-NEXT: retq + + %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> + %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5> + ret <4 x float> %d +} diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll new file mode 100644 index 000000000000..ef60272b6c35 --- /dev/null +++ b/test/CodeGen/X86/vector-shuffle-sse1.ll @@ -0,0 +1,235 @@ +; RUN: llc < %s -mcpu=x86-64 -mattr=-sse2 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=SSE1 + +target triple = "x86_64-unknown-unknown" + +define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_0001: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_0020: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_0300: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_1000: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_2200: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_3330: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_3210: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> 
<i32 3, i32 2, i32 1, i32 0> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_0011: +; SSE1: # BB#0: +; SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_2233: +; SSE1: # BB#0: +; SSE1-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_0022: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2> + ret <4 x float> %shuffle +} +define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) { +; SSE1-LABEL: shuffle_v4f32_1133: +; SSE1: # BB#0: +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) { +; SSE1-LABEL: shuffle_v4f32_4zzz: +; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: movss %xmm0, %xmm1 +; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) { +; SSE1-LABEL: shuffle_v4f32_z4zz: +; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0] +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) { +; SSE1-LABEL: shuffle_v4f32_zz4z: +; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,2] +; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) { +; SSE1-LABEL: shuffle_v4f32_zuu4: +; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) { +; SSE1-LABEL: shuffle_v4f32_zzz7: +; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] +; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) { +; SSE1-LABEL: shuffle_v4f32_z6zz: 
+; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0] +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] +; SSE1-NEXT: retq + %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3> + ret <4 x float> %shuffle +} + +define <4 x float> @insert_reg_and_zero_v4f32(float %a) { +; SSE1-LABEL: insert_reg_and_zero_v4f32: +; SSE1: # BB#0: +; SSE1-NEXT: xorps %xmm1, %xmm1 +; SSE1-NEXT: movss %xmm0, %xmm1 +; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: retq + %v = insertelement <4 x float> undef, float %a, i32 0 + %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %shuffle +} + +define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) { +; SSE1-LABEL: insert_mem_and_zero_v4f32: +; SSE1: # BB#0: +; SSE1-NEXT: movss (%rdi), %xmm0 +; SSE1-NEXT: retq + %a = load float* %ptr + %v = insertelement <4 x float> undef, float %a, i32 0 + %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7> + ret <4 x float> %shuffle +} + +define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) { +; SSE1-LABEL: insert_mem_lo_v4f32: +; SSE1: # BB#0: +; SSE1-NEXT: movq (%rdi), %rax +; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; SSE1-NEXT: shrq $32, %rax +; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; SSE1-NEXT: movss -{{[0-9]+}}(%rsp), %xmm1 +; SSE1-NEXT: movss -{{[0-9]+}}(%rsp), %xmm2 +; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE1-NEXT: xorps %xmm2, %xmm2 +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1] +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; SSE1-NEXT: movaps %xmm1, %xmm0 +; SSE1-NEXT: retq + %a = load <2 x float>* %ptr + %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7> + ret <4 x float> %shuffle +} + +define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) { +; SSE1-LABEL: insert_mem_hi_v4f32: +; SSE1: # BB#0: +; SSE1-NEXT: movq (%rdi), %rax +; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; SSE1-NEXT: shrq $32, %rax +; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; SSE1-NEXT: movss -{{[0-9]+}}(%rsp), %xmm1 +; SSE1-NEXT: movss -{{[0-9]+}}(%rsp), %xmm2 +; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE1-NEXT: xorps %xmm2, %xmm2 +; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,1] +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1] +; SSE1-NEXT: retq + %a = load <2 x float>* %ptr + %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> + ret <4 x float> %shuffle +} + +define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) { +; SSE1-LABEL: shuffle_mem_v4f32_3210: +; SSE1: # BB#0: +; SSE1-NEXT: movaps (%rdi), %xmm0 +; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; SSE1-NEXT: retq + %a = load <4 x float>* %ptr + %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + ret <4 x float> %shuffle +} diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll new file mode 100644 index 000000000000..8a5b7488f664 --- /dev/null +++ b/test/CodeGen/X86/vector-trunc.ll @@ -0,0 +1,90 @@ +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 + +define i64 @trunc2i64(<2 x i64> %inval) { +; SSE-LABEL: trunc2i64: +; SSE: # BB#0: # %entry +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movd %xmm0, %rax +; SSE-NEXT: retq + +; AVX-LABEL: trunc2i64: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq + +entry: + %0 = trunc <2 x i64> %inval to <2 x i32> + %1 = bitcast <2 x i32> %0 to i64 + ret i64 %1 +} + +; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524 +define i64 @trunc4i32(<4 x i32> %inval) { +; SSE2-LABEL: trunc4i32: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: retq + +; SSSE3-LABEL: trunc4i32: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: retq + +; SSE41-LABEL: trunc4i32: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: retq + +; AVX-LABEL: trunc4i32: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq + +entry: + %0 = trunc <4 x i32> %inval to <4 x i16> + %1 = bitcast <4 x i16> %0 to i64 + ret i64 %1 +} + +; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524 +define i64 @trunc8i16(<8 x i16> %inval) { +; SSE2-LABEL: trunc8i16: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: pand .LCP{{.*}}(%rip), %xmm0 +; SSE2-NEXT: packuswb %xmm0, %xmm0 +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: retq + +; SSSE3-LABEL: trunc8i16: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: retq + +; SSE41-LABEL: trunc8i16: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: retq + +; AVX-LABEL: trunc8i16: +; AVX: # BB#0: # %entry +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: retq + +entry: + %0 = trunc <8 x i16> %inval to <8 x i8> + %1 = bitcast <8 x i8> %0 to i64 + ret i64 %1 +} diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll new file mode 100644 index 000000000000..cd09deee4550 --- /dev/null +++ b/test/CodeGen/X86/vector-zext.ll @@ -0,0 +1,360 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 + +define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_8i16_to_8i32: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i16_to_8i32: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i16_to_8i32: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovzxwd %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i16_to_8i32: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i16_to_8i32: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxwd %xmm0, %ymm0 +; AVX2-NEXT: retq +entry: + %B = zext <8 x i16> %A to <8 x i32> + ret <8 x i32>%B +} + +define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { +; SSE2-LABEL: zext_4i32_to_4i64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; SSE2-NEXT: pand %xmm3, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_4i32_to_4i64: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; SSSE3-NEXT: pand %xmm3, %xmm2 +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSSE3-NEXT: pand %xmm3, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_4i32_to_4i64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovzxdq %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] +; SSE41-NEXT: pand %xmm3, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_4i32_to_4i64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_4i32_to_4i64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxdq 
%xmm0, %ymm0 +; AVX2-NEXT: retq +entry: + %B = zext <4 x i32> %A to <4 x i64> + ret <4 x i64>%B +} + +define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) { +; SSE2-LABEL: zext_8i8_to_8i32: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_8i8_to_8i32: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_8i8_to_8i32: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovzxwd %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255] +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_8i8_to_8i32: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpmovzxwd %xmm0, %xmm1 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_8i8_to_8i32: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxwd %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +entry: + %t = zext <8 x i8> %z to <8 x i32> + ret <8 x i32> %t +} + +; PR17654 +define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) { +; SSE2-LABEL: zext_16i8_to_16i16: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: zext_16i8_to_16i16: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: pand %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: zext_16i8_to_16i16: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: pmovzxbw %xmm0, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pand %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX1-LABEL: zext_16i8_to_16i16: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: zext_16i8_to_16i16: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxbw %xmm0, %ymm0 +; AVX2-NEXT: retq +entry: + %t = zext <16 x i8> %z to <16 x i16> + ret <16 x i16> %t +} + +define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) { +; SSE2-LABEL: load_zext_16i8_to_16i16: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: punpckhbw %xmm1, %xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: retq + +; SSSE3-LABEL: load_zext_16i8_to_16i16: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklbw %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: punpckhbw %xmm1, %xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: retq + +; SSE41-LABEL: load_zext_16i8_to_16i16: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: pmovzxbw %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: punpckhbw %xmm1, %xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: retq + +; AVX1-LABEL: load_zext_16i8_to_16i16: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX1-NEXT: vpmovzxbw %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq + +; AVX2-LABEL: load_zext_16i8_to_16i16: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxbw (%rdi), %ymm0 +; AVX2-NEXT: retq +entry: + %X = load <16 x i8>* %ptr + %Y = zext <16 x i8> %X to <16 x i16> + ret <16 x i16> %Y +} + +define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) { +; SSE2-LABEL: load_zext_8i16_to_8i32: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: punpckhwd %xmm1, %xmm1 # xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: retq + +; SSSE3-LABEL: load_zext_8i16_to_8i32: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklwd %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: punpckhwd %xmm1, %xmm1 # xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: retq + +; SSE41-LABEL: load_zext_8i16_to_8i32: +; SSE41: # BB#0: # %entry 
+; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: pmovzxwd %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: punpckhwd %xmm1, %xmm1 # xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: retq + +; AVX1-LABEL: load_zext_8i16_to_8i32: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpmovzxwd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq + +; AVX2-LABEL: load_zext_8i16_to_8i32: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxwd (%rdi), %ymm0 +; AVX2-NEXT: retq +entry: + %X = load <8 x i16>* %ptr + %Y = zext <8 x i16> %X to <8 x i32> + ret <8 x i32>%Y +} + +define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) { +; SSE2-LABEL: load_zext_4i32_to_4i64: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pshufd $-44, %xmm1, %xmm0 # xmm0 = xmm1[0,1,1,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pshufd $-6, %xmm1, %xmm1 # xmm1 = xmm1[2,2,3,3] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: retq + +; SSSE3-LABEL: load_zext_4i32_to_4i64: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: pshufd $-44, %xmm1, %xmm0 # xmm0 = xmm1[0,1,1,3] +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufd $-6, %xmm1, %xmm1 # xmm1 = xmm1[2,2,3,3] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: retq + +; SSE41-LABEL: load_zext_4i32_to_4i64: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: movdqa (%rdi), %xmm1 +; SSE41-NEXT: pmovzxdq %xmm1, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pshufd $-6, %xmm1, %xmm1 # xmm1 = xmm1[2,2,3,3] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: retq + +; AVX1-LABEL: load_zext_4i32_to_4i64: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq + +; AVX2-LABEL: load_zext_4i32_to_4i64: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpmovzxdq (%rdi), %ymm0 +; AVX2-NEXT: retq +entry: + %X = load <4 x i32>* %ptr + %Y = zext <4 x i32> %X to <4 x i64> + ret <4 x i64>%Y +} diff --git a/test/CodeGen/X86/vector-zmov.ll b/test/CodeGen/X86/vector-zmov.ll new file mode 100644 index 000000000000..4de2543a1d6b --- /dev/null +++ b/test/CodeGen/X86/vector-zmov.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 + +define <4 x i32> @load_zmov_4i32_to_0zzz(<4 x i32> *%ptr) { +; SSE-LABEL: load_zmov_4i32_to_0zzz: +; 
SSE: # BB#0: # %entry +; SSE-NEXT: movd (%rdi), %xmm0 +; SSE-NEXT: retq + +; AVX-LABEL: load_zmov_4i32_to_0zzz: +; AVX: # BB#0: # %entry +; AVX-NEXT: vmovd (%rdi), %xmm0 +; AVX-NEXT: retq +entry: + %X = load <4 x i32>* %ptr + %Y = shufflevector <4 x i32> %X, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4> + ret <4 x i32>%Y +} + +define <2 x i64> @load_zmov_2i64_to_0z(<2 x i64> *%ptr) { +; SSE-LABEL: load_zmov_2i64_to_0z: +; SSE: # BB#0: # %entry +; SSE-NEXT: movq (%rdi), %xmm0 +; SSE-NEXT: retq + +; AVX-LABEL: load_zmov_2i64_to_0z: +; AVX: # BB#0: # %entry +; AVX-NEXT: vmovq (%rdi), %xmm0 +; AVX-NEXT: retq +entry: + %X = load <2 x i64>* %ptr + %Y = shufflevector <2 x i64> %X, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2> + ret <2 x i64>%Y +} diff --git a/test/CodeGen/X86/vectorcall.ll b/test/CodeGen/X86/vectorcall.ll new file mode 100644 index 000000000000..1e52654e99fe --- /dev/null +++ b/test/CodeGen/X86/vectorcall.ll @@ -0,0 +1,93 @@ +; RUN: llc -mtriple=i686-pc-win32 -mattr=+sse2 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X86 +; RUN: llc -mtriple=x86_64-pc-win32 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=X64 + +; Test integer arguments. + +define x86_vectorcallcc i32 @test_int_1() { + ret i32 0 +} + +; CHECK-LABEL: {{^}}test_int_1@@0: +; CHECK: xorl %eax, %eax + +define x86_vectorcallcc i32 @test_int_2(i32 inreg %a) { + ret i32 %a +} + +; X86-LABEL: {{^}}test_int_2@@4: +; X64-LABEL: {{^}}test_int_2@@8: +; CHECK: movl %ecx, %eax + +define x86_vectorcallcc i32 @test_int_3(i64 inreg %a) { + %at = trunc i64 %a to i32 + ret i32 %at +} + +; X86-LABEL: {{^}}test_int_3@@8: +; X64-LABEL: {{^}}test_int_3@@8: +; CHECK: movl %ecx, %eax + +define x86_vectorcallcc i32 @test_int_4(i32 inreg %a, i32 inreg %b) { + %s = add i32 %a, %b + ret i32 %s +} + +; X86-LABEL: {{^}}test_int_4@@8: +; X86: leal (%ecx,%edx), %eax + +; X64-LABEL: {{^}}test_int_4@@16: +; X64: leal (%rcx,%rdx), %eax + +define x86_vectorcallcc i32 @"\01test_int_5"(i32, i32) { + ret i32 0 +} +; CHECK-LABEL: {{^}}test_int_5: + +define x86_vectorcallcc double @test_fp_1(double %a, double %b) { + ret double %b +} +; CHECK-LABEL: {{^}}test_fp_1@@16: +; CHECK: movaps %xmm1, %xmm0 + +define x86_vectorcallcc double @test_fp_2( + double, double, double, double, double, double, double %r) { + ret double %r +} +; CHECK-LABEL: {{^}}test_fp_2@@56: +; CHECK: movsd {{[0-9]+\(%[re]sp\)}}, %xmm0 + +define x86_vectorcallcc {double, double, double, double} @test_fp_3() { + ret {double, double, double, double} + { double 0.0, double 0.0, double 0.0, double 0.0 } +} +; CHECK-LABEL: {{^}}test_fp_3@@0: +; CHECK: xorps %xmm0 +; CHECK: xorps %xmm1 +; CHECK: xorps %xmm2 +; CHECK: xorps %xmm3 + +; FIXME: Returning via x87 isn't compatible, but it's hard to structure the +; tablegen any other way. 
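+; (Reading the checks below: the first four doubles of the returned aggregate +; come back in XMM0-XMM3, zeroed via xorps, while the fifth is produced on the +; x87 stack by fldz; that x87 component is the part the FIXME above flags.) 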
+define x86_vectorcallcc {double, double, double, double, double} @test_fp_4() { + ret {double, double, double, double, double} + { double 0.0, double 0.0, double 0.0, double 0.0, double 0.0 } +} +; CHECK-LABEL: {{^}}test_fp_4@@0: +; CHECK: fldz +; CHECK: xorps %xmm0 +; CHECK: xorps %xmm1 +; CHECK: xorps %xmm2 +; CHECK: xorps %xmm3 + +define x86_vectorcallcc <16 x i8> @test_vec_1(<16 x i8> %a, <16 x i8> %b) { + ret <16 x i8> %b +} +; CHECK-LABEL: {{^}}test_vec_1@@32: +; CHECK: movaps %xmm1, %xmm0 + +define x86_vectorcallcc <16 x i8> @test_vec_2( + double, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> %r) { + ret <16 x i8> %r +} +; CHECK-LABEL: {{^}}test_vec_2@@104: +; CHECK: movaps (%{{[re]}}cx), %xmm0 diff --git a/test/CodeGen/X86/vselect-2.ll b/test/CodeGen/X86/vselect-2.ll index 50da32c67a3b..0991bdacd9c5 100644 --- a/test/CodeGen/X86/vselect-2.ll +++ b/test/CodeGen/X86/vselect-2.ll @@ -1,33 +1,60 @@ -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=sse2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41 define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) { +; SSE2-LABEL: test1 +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test1 +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: retq %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B ret <4 x i32> %select } -; CHECK-LABEL: test1 -; CHECK: movsd -; CHECK: ret define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) { +; SSE2-LABEL: test2 +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test2 +; SSE41: # BB#0: +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: retq %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x i32> %A, <4 x i32> %B ret <4 x i32> %select } -; CHECK-LABEL: test2 -; CHECK: movsd -; CHECK-NEXT: ret define <4 x float> @test3(<4 x float> %A, <4 x float> %B) { +; SSE2-LABEL: test3 +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test3 +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] +; SSE41-NEXT: retq %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x float> %A, <4 x float> %B ret <4 x float> %select } -; CHECK-LABEL: test3 -; CHECK: movsd -; CHECK: ret define <4 x float> @test4(<4 x float> %A, <4 x float> %B) { +; SSE2-LABEL: test4 +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test4 +; SSE41: # BB#0: +; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE41-NEXT: retq %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x float> %A, <4 x float> %B ret <4 x float> %select } -; CHECK-LABEL: test4 -; CHECK: movsd -; CHECK-NEXT: ret diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll new file mode 100644 index 000000000000..0c0f4bbf992a --- /dev/null +++ b/test/CodeGen/X86/vselect-avx.ll @@ -0,0 +1,85 @@ +; RUN: llc %s -o - -mattr=+avx | FileCheck %s +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx" + +; For this test we used to optimize the <i1 true, i1 false, i1 false, i1 true> +; mask into <i32 2147483648, i32 0, i32 0, i32 
2147483648> because we thought +; we would lower that into a blend where only the high bit is relevant. +; However, since the whole mask is constant, this is simplified incorrectly +; by the generic code, because it was expecting -1 in place of 2147483648. +; +; The problem does not occur without AVX, because vselect of v4i32 is not legal +; nor custom. +; +; <rdar://problem/18675020> + +; CHECK-LABEL: test: +; CHECK: vmovdqa {{.*#+}} xmm0 = [65535,0,0,65535] +; CHECK: vmovdqa {{.*#+}} xmm2 = [65533,124,125,14807] +; CHECK: ret +define void @test(<4 x i16>* %a, <4 x i16>* %b) { +body: + %predphi = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -3, i16 545, i16 4385, i16 14807>, <4 x i16> <i16 123, i16 124, i16 125, i16 127> + %predphi42 = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer + store <4 x i16> %predphi, <4 x i16>* %a, align 8 + store <4 x i16> %predphi42, <4 x i16>* %b, align 8 + ret void +} + +; Improve code coverage. +; +; When shrinking the condition used in the select to match a blend, this +; test case exercises the path where the modified node is not the root +; of the condition. +; +; CHECK-LABEL: test2: +; CHECK: vpslld $31, %xmm0, %xmm0 +; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 +; CHECK-NEXT: vpshufd $78, %xmm0, %xmm0 ## xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, [[MASK:%ymm[0-9]+]] +; CHECK: vblendvpd [[MASK]] +; CHECK: retq +define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) { +bb: + %arrayidx1928 = getelementptr inbounds double** %call1559, i64 %indvars.iv4198 + %tmp1888 = load double** %arrayidx1928, align 8 + %predphi.v.v = select <4 x i1> %tmp1895, <4 x double> <double -5.000000e-01, double -5.000000e-01, double -5.000000e-01, double -5.000000e-01>, <4 x double> <double 5.000000e-01, double 5.000000e-01, double 5.000000e-01, double 5.000000e-01> + %tmp1900 = bitcast double* %tmp1888 to <4 x double>* + store <4 x double> %predphi.v.v, <4 x double>* %tmp1900, align 8 + ret void +} + +; For this test, we used to optimize the conditional mask for the blend, i.e., +; we shrunk some of its bits. +; However, this same mask was used in another select (%predphi31) that turned out +; to be optimized into an and. In that case, the conditional mask was wrong. +; +; Make sure that the and is fed by the original mask. +; +; <rdar://problem/18819506> + +; Note: For now, hard code ORIG_MASK and SHRUNK_MASK registers, because we +; cannot express that SHRUNK_MASK must not be equal to ORIG_MASK. Otherwise, +; even a faulty pattern would pass! +; +; CHECK-LABEL: test3: +; Compute the original mask. +; CHECK: vpcmpeqd {{%xmm[0-9]+}}, {{%xmm[0-9]+}}, [[ORIG_MASK:%xmm0]] +; Shrink the bits of the mask. +; CHECK-NEXT: vpslld $31, [[ORIG_MASK]], [[SHRUNK_MASK:%xmm3]] +; Use the shrunk mask in the blend. +; CHECK-NEXT: vblendvps [[SHRUNK_MASK]], %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}} +; Use the original mask in the and. 
+; CHECK-NEXT: vpand LCPI2_2(%rip), [[ORIG_MASK]], {{%xmm[0-9]+}} +; CHECK: retq +define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) { + %tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3> + %tmp7 = icmp eq <4 x i32> %tmp6, zeroinitializer + %predphi = select <4 x i1> %tmp7, <4 x i16> %tmp3, <4 x i16> %tmp12 + %predphi31 = select <4 x i1> %tmp7, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer + + store <4 x i16> %predphi31, <4 x i16>* %tmp16, align 8 + store <4 x i16> %predphi, <4 x i16>* %tmp17, align 8 + ret void +} diff --git a/test/CodeGen/X86/vselect-minmax.ll b/test/CodeGen/X86/vselect-minmax.ll index 25189f23e43a..3efe5684c15b 100644 --- a/test/CodeGen/X86/vselect-minmax.ll +++ b/test/CodeGen/X86/vselect-minmax.ll @@ -2,6 +2,8 @@ ; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s -check-prefix=SSE4 ; RUN: llc -march=x86-64 -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1 ; RUN: llc -march=x86-64 -mcpu=core-avx2 -mattr=+avx2 < %s | FileCheck %s -check-prefix=AVX2 +; RUN: llc -march=x86-64 -mcpu=knl < %s | FileCheck %s -check-prefix=AVX2 -check-prefix=AVX512F +; RUN: llc -march=x86-64 -mcpu=skx < %s | FileCheck %s -check-prefix=AVX512BW -check-prefix=AVX512VL -check-prefix=AVX512F define void @test1(i8* nocapture %a, i8* nocapture %b) nounwind { vector.ph: @@ -33,6 +35,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test1: ; AVX2: vpminsb + +; AVX512VL-LABEL: test1: +; AVX512VL: vpminsb } define void @test2(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -65,6 +70,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test2: ; AVX2: vpminsb + +; AVX512VL-LABEL: test2: +; AVX512VL: vpminsb } define void @test3(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -97,6 +105,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test3: ; AVX2: vpmaxsb + +; AVX512VL-LABEL: test3: +; AVX512VL: vpmaxsb } define void @test4(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -129,6 +140,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test4: ; AVX2: vpmaxsb + +; AVX512VL-LABEL: test4: +; AVX512VL: vpmaxsb } define void @test5(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -161,6 +175,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test5: ; AVX2: vpminub + +; AVX512VL-LABEL: test5: +; AVX512VL: vpminub } define void @test6(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -193,6 +210,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test6: ; AVX2: vpminub + +; AVX512VL-LABEL: test6: +; AVX512VL: vpminub } define void @test7(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -225,6 +245,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test7: ; AVX2: vpmaxub + +; AVX512VL-LABEL: test7: +; AVX512VL: vpmaxub } define void @test8(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -257,6 +280,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test8: ; AVX2: vpmaxub + +; AVX512VL-LABEL: test8: +; AVX512VL: vpmaxub } define void @test9(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -289,6 +315,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test9: ; AVX2: vpminsw + +; AVX512VL-LABEL: test9: +; AVX512VL: vpminsw } define void @test10(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -321,6 +350,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test10: ; AVX2: vpminsw + +; AVX512VL-LABEL: test10: +; AVX512VL: vpminsw } define void @test11(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -353,6 +385,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test11: ; AVX2: vpmaxsw + 
+; AVX512VL-LABEL: test11: +; AVX512VL: vpmaxsw } define void @test12(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -385,6 +420,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test12: ; AVX2: vpmaxsw + +; AVX512VL-LABEL: test12: +; AVX512VL: vpmaxsw } define void @test13(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -417,6 +455,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test13: ; AVX2: vpminuw + +; AVX512VL-LABEL: test13: +; AVX512VL: vpminuw } define void @test14(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -449,6 +490,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test14: ; AVX2: vpminuw + +; AVX512VL-LABEL: test14: +; AVX512VL: vpminuw } define void @test15(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -481,6 +525,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test15: ; AVX2: vpmaxuw + +; AVX512VL-LABEL: test15: +; AVX512VL: vpmaxuw } define void @test16(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -513,6 +560,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test16: ; AVX2: vpmaxuw + +; AVX512VL-LABEL: test16: +; AVX512VL: vpmaxuw } define void @test17(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -545,6 +595,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test17: ; AVX2: vpminsd + +; AVX512VL-LABEL: test17: +; AVX512VL: vpminsd } define void @test18(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -577,6 +630,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test18: ; AVX2: vpminsd + +; AVX512VL-LABEL: test18: +; AVX512VL: vpminsd } define void @test19(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -609,6 +665,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test19: ; AVX2: vpmaxsd + +; AVX512VL-LABEL: test19: +; AVX512VL: vpmaxsd } define void @test20(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -641,6 +700,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test20: ; AVX2: vpmaxsd + +; AVX512VL-LABEL: test20: +; AVX512VL: vpmaxsd } define void @test21(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -673,6 +735,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test21: ; AVX2: vpminud + +; AVX512VL-LABEL: test21: +; AVX512VL: vpminud } define void @test22(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -705,6 +770,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test22: ; AVX2: vpminud + +; AVX512VL-LABEL: test22: +; AVX512VL: vpminud } define void @test23(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -737,6 +805,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test23: ; AVX2: vpmaxud + +; AVX512VL-LABEL: test23: +; AVX512VL: vpmaxud } define void @test24(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -769,6 +840,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test24: ; AVX2: vpmaxud + +; AVX512VL-LABEL: test24: +; AVX512VL: vpmaxud } define void @test25(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -795,6 +869,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test25: ; AVX2: vpminsb + +; AVX512VL-LABEL: test25: +; AVX512VL: vpminsb } define void @test26(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -821,6 +898,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test26: ; AVX2: vpminsb + +; AVX512VL-LABEL: test26: +; AVX512VL: vpminsb } define void @test27(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -847,6 +927,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test27: ; AVX2: vpmaxsb + +; AVX512VL-LABEL: test27: +; AVX512VL: vpmaxsb } define void @test28(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -873,6 +956,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: 
test28: ; AVX2: vpmaxsb + +; AVX512VL-LABEL: test28: +; AVX512VL: vpmaxsb } define void @test29(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -899,6 +985,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test29: ; AVX2: vpminub + +; AVX512VL-LABEL: test29: +; AVX512VL: vpminub } define void @test30(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -925,6 +1014,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test30: ; AVX2: vpminub + +; AVX512VL-LABEL: test30: +; AVX512VL: vpminub } define void @test31(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -951,6 +1043,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test31: ; AVX2: vpmaxub + +; AVX512VL-LABEL: test31: +; AVX512VL: vpmaxub } define void @test32(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -977,6 +1072,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test32: ; AVX2: vpmaxub + +; AVX512VL-LABEL: test32: +; AVX512VL: vpmaxub } define void @test33(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1003,6 +1101,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test33: ; AVX2: vpminsw + +; AVX512VL-LABEL: test33: +; AVX512VL: vpminsw } define void @test34(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1029,6 +1130,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test34: ; AVX2: vpminsw + +; AVX512VL-LABEL: test34: +; AVX512VL: vpminsw } define void @test35(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1055,6 +1159,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test35: ; AVX2: vpmaxsw + +; AVX512VL-LABEL: test35: +; AVX512VL: vpmaxsw } define void @test36(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1081,6 +1188,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test36: ; AVX2: vpmaxsw + +; AVX512VL-LABEL: test36: +; AVX512VL: vpmaxsw } define void @test37(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1107,6 +1217,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test37: ; AVX2: vpminuw + +; AVX512VL-LABEL: test37: +; AVX512VL: vpminuw } define void @test38(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1133,6 +1246,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test38: ; AVX2: vpminuw + +; AVX512VL-LABEL: test38: +; AVX512VL: vpminuw } define void @test39(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1159,6 +1275,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test39: ; AVX2: vpmaxuw + +; AVX512VL-LABEL: test39: +; AVX512VL: vpmaxuw } define void @test40(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1185,6 +1304,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test40: ; AVX2: vpmaxuw + +; AVX512VL-LABEL: test40: +; AVX512VL: vpmaxuw } define void @test41(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -1211,6 +1333,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test41: ; AVX2: vpminsd + +; AVX512VL-LABEL: test41: +; AVX512VL: vpminsd } define void @test42(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -1237,6 +1362,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test42: ; AVX2: vpminsd + +; AVX512VL-LABEL: test42: +; AVX512VL: vpminsd } define void @test43(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -1263,6 +1391,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test43: ; AVX2: vpmaxsd + +; AVX512VL-LABEL: test43: +; AVX512VL: vpmaxsd } define void @test44(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -1289,6 +1420,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test44: ; AVX2: vpmaxsd + +; AVX512VL-LABEL: test44: +; AVX512VL: vpmaxsd } define void @test45(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -1315,6 
+1449,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test45: ; AVX2: vpminud + +; AVX512VL-LABEL: test45: +; AVX512VL: vpminud } define void @test46(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -1341,6 +1478,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test46: ; AVX2: vpminud + +; AVX512VL-LABEL: test46: +; AVX512VL: vpminud } define void @test47(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -1367,6 +1507,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test47: ; AVX2: vpmaxud + +; AVX512VL-LABEL: test47: +; AVX512VL: vpmaxud } define void @test48(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -1393,6 +1536,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test48: ; AVX2: vpmaxud + +; AVX512VL-LABEL: test48: +; AVX512VL: vpmaxud } define void @test49(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -1425,6 +1571,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test49: ; AVX2: vpmaxsb + +; AVX512VL-LABEL: test49: +; AVX512VL: vpmaxsb } define void @test50(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -1457,6 +1606,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test50: ; AVX2: vpmaxsb + +; AVX512VL-LABEL: test50: +; AVX512VL: vpmaxsb } define void @test51(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -1489,6 +1641,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test51: ; AVX2: vpminsb + +; AVX512VL-LABEL: test51: +; AVX512VL: vpminsb } define void @test52(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -1521,6 +1676,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test52: ; AVX2: vpminsb + +; AVX512VL-LABEL: test52: +; AVX512VL: vpminsb } define void @test53(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -1553,6 +1711,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test53: ; AVX2: vpmaxub + +; AVX512VL-LABEL: test53: +; AVX512VL: vpmaxub } define void @test54(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -1585,6 +1746,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test54: ; AVX2: vpmaxub + +; AVX512VL-LABEL: test54: +; AVX512VL: vpmaxub } define void @test55(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -1617,6 +1781,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test55: ; AVX2: vpminub + +; AVX512VL-LABEL: test55: +; AVX512VL: vpminub } define void @test56(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -1649,6 +1816,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test56: ; AVX2: vpminub + +; AVX512VL-LABEL: test56: +; AVX512VL: vpminub } define void @test57(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1681,6 +1851,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test57: ; AVX2: vpmaxsw + +; AVX512VL-LABEL: test57: +; AVX512VL: vpmaxsw } define void @test58(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1713,6 +1886,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test58: ; AVX2: vpmaxsw + +; AVX512VL-LABEL: test58: +; AVX512VL: vpmaxsw } define void @test59(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1745,6 +1921,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test59: ; AVX2: vpminsw + +; AVX512VL-LABEL: test59: +; AVX512VL: vpminsw } define void @test60(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1777,6 +1956,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test60: ; AVX2: vpminsw + +; AVX512VL-LABEL: test60: +; AVX512VL: vpminsw } define void @test61(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1809,6 +1991,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test61: ; AVX2: vpmaxuw + +; AVX512VL-LABEL: test61: +; AVX512VL: vpmaxuw } define void @test62(i16* 
nocapture %a, i16* nocapture %b) nounwind { @@ -1841,6 +2026,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test62: ; AVX2: vpmaxuw + +; AVX512VL-LABEL: test62: +; AVX512VL: vpmaxuw } define void @test63(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1873,6 +2061,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test63: ; AVX2: vpminuw + +; AVX512VL-LABEL: test63: +; AVX512VL: vpminuw } define void @test64(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -1905,6 +2096,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test64: ; AVX2: vpminuw + +; AVX512VL-LABEL: test64: +; AVX512VL: vpminuw } define void @test65(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -1937,6 +2131,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test65: ; AVX2: vpmaxsd + +; AVX512VL-LABEL: test65: +; AVX512VL: vpmaxsd } define void @test66(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -1969,6 +2166,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test66: ; AVX2: vpmaxsd + +; AVX512VL-LABEL: test66: +; AVX512VL: vpmaxsd } define void @test67(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -2001,6 +2201,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test67: ; AVX2: vpminsd + +; AVX512VL-LABEL: test67: +; AVX512VL: vpminsd } define void @test68(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -2033,6 +2236,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test68: ; AVX2: vpminsd + +; AVX512VL-LABEL: test68: +; AVX512VL: vpminsd } define void @test69(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -2065,6 +2271,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test69: ; AVX2: vpmaxud + +; AVX512VL-LABEL: test69: +; AVX512VL: vpmaxud } define void @test70(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -2097,6 +2306,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test70: ; AVX2: vpmaxud + +; AVX512VL-LABEL: test70: +; AVX512VL: vpmaxud } define void @test71(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -2129,6 +2341,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test71: ; AVX2: vpminud + +; AVX512VL-LABEL: test71: +; AVX512VL: vpminud } define void @test72(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -2161,6 +2376,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test72: ; AVX2: vpminud + +; AVX512VL-LABEL: test72: +; AVX512VL: vpminud } define void @test73(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -2187,6 +2405,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test73: ; AVX2: vpmaxsb + +; AVX512VL-LABEL: test73: +; AVX512VL: vpmaxsb } define void @test74(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -2213,6 +2434,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test74: ; AVX2: vpmaxsb + +; AVX512VL-LABEL: test74: +; AVX512VL: vpmaxsb } define void @test75(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -2239,6 +2463,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test75: ; AVX2: vpminsb + +; AVX512VL-LABEL: test75: +; AVX512VL: vpminsb } define void @test76(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -2265,6 +2492,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test76: ; AVX2: vpminsb + +; AVX512VL-LABEL: test76: +; AVX512VL: vpminsb } define void @test77(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -2291,6 +2521,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test77: ; AVX2: vpmaxub + +; AVX512VL-LABEL: test77: +; AVX512VL: vpmaxub } define void @test78(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -2317,6 +2550,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test78: ; AVX2: vpmaxub + +; AVX512VL-LABEL: 
test78: +; AVX512VL: vpmaxub } define void @test79(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -2343,6 +2579,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test79: ; AVX2: vpminub + +; AVX512VL-LABEL: test79: +; AVX512VL: vpminub } define void @test80(i8* nocapture %a, i8* nocapture %b) nounwind { @@ -2369,6 +2608,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test80: ; AVX2: vpminub + +; AVX512VL-LABEL: test80: +; AVX512VL: vpminub } define void @test81(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -2395,6 +2637,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test81: ; AVX2: vpmaxsw + +; AVX512VL-LABEL: test81: +; AVX512VL: vpmaxsw } define void @test82(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -2421,6 +2666,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test82: ; AVX2: vpmaxsw + +; AVX512VL-LABEL: test82: +; AVX512VL: vpmaxsw } define void @test83(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -2447,6 +2695,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test83: ; AVX2: vpminsw + +; AVX512VL-LABEL: test83: +; AVX512VL: vpminsw } define void @test84(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -2473,6 +2724,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test84: ; AVX2: vpminsw + +; AVX512VL-LABEL: test84: +; AVX512VL: vpminsw } define void @test85(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -2499,6 +2753,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test85: ; AVX2: vpmaxuw + +; AVX512VL-LABEL: test85: +; AVX512VL: vpmaxuw } define void @test86(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -2525,6 +2782,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test86: ; AVX2: vpmaxuw + +; AVX512VL-LABEL: test86: +; AVX512VL: vpmaxuw } define void @test87(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -2551,6 +2811,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test87: ; AVX2: vpminuw + +; AVX512VL-LABEL: test87: +; AVX512VL: vpminuw } define void @test88(i16* nocapture %a, i16* nocapture %b) nounwind { @@ -2577,6 +2840,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test88: ; AVX2: vpminuw + +; AVX512VL-LABEL: test88: +; AVX512VL: vpminuw } define void @test89(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -2603,6 +2869,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test89: ; AVX2: vpmaxsd + +; AVX512VL-LABEL: test89: +; AVX512VL: vpmaxsd } define void @test90(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -2629,6 +2898,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test90: ; AVX2: vpmaxsd + +; AVX512VL-LABEL: test90: +; AVX512VL: vpmaxsd } define void @test91(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -2655,6 +2927,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test91: ; AVX2: vpminsd + +; AVX512VL-LABEL: test91: +; AVX512VL: vpminsd } define void @test92(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -2681,6 +2956,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test92: ; AVX2: vpminsd + +; AVX512VL-LABEL: test92: +; AVX512VL: vpminsd } define void @test93(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -2707,6 +2985,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test93: ; AVX2: vpmaxud + +; AVX512VL-LABEL: test93: +; AVX512VL: vpmaxud } define void @test94(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -2733,6 +3014,9 @@ for.end: ; preds = %vector.body ; AVX2-LABEL: test94: ; AVX2: vpmaxud + +; AVX512VL-LABEL: test94: +; AVX512VL: vpmaxud } define void @test95(i32* nocapture %a, i32* nocapture %b) nounwind { @@ -2759,6 +3043,9 @@ for.end: ; preds = 
%vector.body
 ; AVX2-LABEL: test95:
 ; AVX2: vpminud
+
+; AVX512VL-LABEL: test95:
+; AVX512VL: vpminud
 }
 
 define void @test96(i32* nocapture %a, i32* nocapture %b) nounwind {
@@ -2785,4 +3072,2507 @@ for.end: ; preds = %vector.body
 ; AVX2-LABEL: test96:
 ; AVX2: vpminud
+
+; AVX512VL-LABEL: test96:
+; AVX512VL: vpminud
+}
+
+; ----------------------------
+
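; The functions below all follow one pattern: an icmp on two loaded vectors
; feeding a select on the same two operands, which the backend should
; recognize and lower to a single vector min/max instruction. A minimal
; standalone sketch of the idiom (the name @smin_v64i8 is illustrative, not
; part of the test file):
;
;   define <64 x i8> @smin_v64i8(<64 x i8> %x, <64 x i8> %y) {
;     %c = icmp slt <64 x i8> %x, %y
;     %r = select <64 x i1> %c, <64 x i8> %x, <64 x i8> %y ; expect: vpminsb
;     ret <64 x i8> %r
;   }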
+define void @test97(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp slt <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test97:
+; AVX512BW: vpminsb {{.*}}
+}
+
+define void @test98(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp sle <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test98:
+; AVX512BW: vpminsb {{.*}}
+}
+
+define void @test99(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp sgt <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test99:
+; AVX512BW: vpmaxsb {{.*}}
+}
+
+define void @test100(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp sge <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test100:
+; AVX512BW: vpmaxsb {{.*}}
+}
+
+define void @test101(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp ult <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test101:
+; AVX512BW: vpminub {{.*}}
+}
+
+define void @test102(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp ule <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test102:
+; AVX512BW: vpminub {{.*}}
+}
+
+define void @test103(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp ugt <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test103:
+; AVX512BW: vpmaxub {{.*}}
+}
+
+define void @test104(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp uge <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.a, <64 x i8> %load.b
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test104:
+; AVX512BW: vpmaxub {{.*}}
+}
+
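; test97-test104 cover all eight icmp predicates on <64 x i8>: slt/sle pick
; the smaller operand (vpminsb), sgt/sge the larger (vpmaxsb), and ult/ule/
; ugt/uge map to the unsigned forms vpminub/vpmaxub. Sketch of the unsigned
; max case (the name @umax_v64i8 is illustrative only):
;
;   define <64 x i8> @umax_v64i8(<64 x i8> %x, <64 x i8> %y) {
;     %c = icmp ugt <64 x i8> %x, %y
;     %r = select <64 x i1> %c, <64 x i8> %x, <64 x i8> %y ; expect: vpmaxub
;     ret <64 x i8> %r
;   }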
+define void @test105(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp slt <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test105:
+; AVX512BW: vpminsw {{.*}}
+}
+
+define void @test106(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp sle <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test106:
+; AVX512BW: vpminsw {{.*}}
+}
+
+define void @test107(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp sgt <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test107:
+; AVX512BW: vpmaxsw {{.*}}
+}
+
+define void @test108(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp sge <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test108:
+; AVX512BW: vpmaxsw {{.*}}
+}
+
+define void @test109(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp ult <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test109:
+; AVX512BW: vpminuw {{.*}}
+}
+
+define void @test110(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp ule <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test110:
+; AVX512BW: vpminuw {{.*}}
+}
+
+define void @test111(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp ugt <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test111:
+; AVX512BW: vpmaxuw {{.*}}
+}
+
+define void @test112(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp uge <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.a, <32 x i16> %load.b
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test112:
+; AVX512BW: vpmaxuw {{.*}}
+}
+
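; test105-test112 repeat the same eight predicates on <32 x i16>, expecting
; vpminsw/vpmaxsw and vpminuw/vpmaxuw. Both the byte and word groups check
; the AVX512BW prefix: 512-bit byte and word operations are only available
; with the BW extension, not with baseline AVX-512F.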
+define void @test113(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp slt <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test113:
+; AVX512F: vpminsd {{.*}}
+}
+
+define void @test114(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp sle <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test114:
+; AVX512F: vpminsd {{.*}}
+}
+
+define void @test115(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp sgt <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test115:
+; AVX512F: vpmaxsd {{.*}}
+}
+
+define void @test116(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp sge <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test116:
+; AVX512F: vpmaxsd {{.*}}
+}
+
+define void @test117(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp ult <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test117:
+; AVX512F: vpminud {{.*}}
+}
+
+define void @test118(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp ule <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test118:
+; AVX512F: vpminud {{.*}}
+}
+
+define void @test119(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp ugt <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test119:
+; AVX512F: vpmaxud {{.*}}
+}
+
+define void @test120(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp uge <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.a, <16 x i32> %load.b
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test120:
+; AVX512F: vpmaxud {{.*}}
+}
+
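; test113-test120: the same matrix on <16 x i32>, now checked under AVX512F,
; since 512-bit dword min/max (vpminsd/vpmaxsd/vpminud/vpmaxud) is baseline
; AVX-512F. Sketch (the name @smin_v16i32 is illustrative only):
;
;   define <16 x i32> @smin_v16i32(<16 x i32> %x, <16 x i32> %y) {
;     %c = icmp slt <16 x i32> %x, %y
;     %r = select <16 x i1> %c, <16 x i32> %x, <16 x i32> %y ; expect: vpminsd
;     ret <16 x i32> %r
;   }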
+define void @test121(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp slt <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test121:
+; AVX512F: vpminsq {{.*}}
+}
+
+define void @test122(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp sle <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test122:
+; AVX512F: vpminsq {{.*}}
+}
+
+define void @test123(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp sgt <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test123:
+; AVX512F: vpmaxsq {{.*}}
+}
+
+define void @test124(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp sge <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test124:
+; AVX512F: vpmaxsq {{.*}}
+}
+
+define void @test125(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp ult <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test125:
+; AVX512F: vpminuq {{.*}}
+}
+
+define void @test126(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp ule <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test126:
+; AVX512F: vpminuq {{.*}}
+}
+
+define void @test127(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp ugt <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test127:
+; AVX512F: vpmaxuq {{.*}}
+}
+
+define void @test128(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp uge <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.a, <8 x i64> %load.b
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test128:
+; AVX512F: vpmaxuq {{.*}}
+}
+
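; test121-test128: <8 x i64> min/max. The quadword forms vpminsq/vpmaxsq/
; vpminuq/vpmaxuq are new with AVX-512; there is no SSE/AVX2 integer min/max
; at 64-bit element width. From test129 on, the same predicates are retested
; with the select operands swapped (%load.b taken on true), which turns each
; min into the corresponding max and vice versa. Sketch (the name
; @smax_via_swap is illustrative only):
;
;   define <8 x i64> @smax_via_swap(<8 x i64> %x, <8 x i64> %y) {
;     %c = icmp slt <8 x i64> %x, %y
;     %r = select <8 x i1> %c, <8 x i64> %y, <8 x i64> %x ; expect: vpmaxsq
;     ret <8 x i64> %r
;   }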
+define void @test129(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp slt <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test129:
+; AVX512BW: vpmaxsb
+}
+
+define void @test130(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp sle <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a
+  store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2
+  %index.next = add i64 %index, 32
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test130:
+; AVX512BW: vpmaxsb
+}
+
+define void @test131(i8* nocapture %a, i8* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i8* %a, i64 %index
+  %gep.b = getelementptr inbounds i8* %b, i64 %index
+  %ptr.a = bitcast i8* %gep.a to <64 x i8>*
+  %ptr.b = bitcast i8* %gep.b to <64 x i8>*
+  %load.a = load <64 x i8>* %ptr.a, align 2
+  %load.b = load <64 x i8>* %ptr.b, align 2
+  %cmp = icmp sgt <64 x i8> %load.a, %load.b
+  %sel = select <64 x i1> %cmp, <64 x i8> 
%load.b, <64 x i8> %load.a + store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2 + %index.next = add i64 %index, 32 + %loop = icmp eq i64 %index.next, 16384 + br i1 %loop, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX512BW-LABEL: test131: +; AVX512BW: vpminsb +} + +define void @test132(i8* nocapture %a, i8* nocapture %b) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.a = getelementptr inbounds i8* %a, i64 %index + %gep.b = getelementptr inbounds i8* %b, i64 %index + %ptr.a = bitcast i8* %gep.a to <64 x i8>* + %ptr.b = bitcast i8* %gep.b to <64 x i8>* + %load.a = load <64 x i8>* %ptr.a, align 2 + %load.b = load <64 x i8>* %ptr.b, align 2 + %cmp = icmp sge <64 x i8> %load.a, %load.b + %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a + store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2 + %index.next = add i64 %index, 32 + %loop = icmp eq i64 %index.next, 16384 + br i1 %loop, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX512BW-LABEL: test132: +; AVX512BW: vpminsb +} + +define void @test133(i8* nocapture %a, i8* nocapture %b) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.a = getelementptr inbounds i8* %a, i64 %index + %gep.b = getelementptr inbounds i8* %b, i64 %index + %ptr.a = bitcast i8* %gep.a to <64 x i8>* + %ptr.b = bitcast i8* %gep.b to <64 x i8>* + %load.a = load <64 x i8>* %ptr.a, align 2 + %load.b = load <64 x i8>* %ptr.b, align 2 + %cmp = icmp ult <64 x i8> %load.a, %load.b + %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a + store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2 + %index.next = add i64 %index, 32 + %loop = icmp eq i64 %index.next, 16384 + br i1 %loop, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX512BW-LABEL: test133: +; AVX512BW: vpmaxub +} + +define void @test134(i8* nocapture %a, i8* nocapture %b) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.a = getelementptr inbounds i8* %a, i64 %index + %gep.b = getelementptr inbounds i8* %b, i64 %index + %ptr.a = bitcast i8* %gep.a to <64 x i8>* + %ptr.b = bitcast i8* %gep.b to <64 x i8>* + %load.a = load <64 x i8>* %ptr.a, align 2 + %load.b = load <64 x i8>* %ptr.b, align 2 + %cmp = icmp ule <64 x i8> %load.a, %load.b + %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a + store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2 + %index.next = add i64 %index, 32 + %loop = icmp eq i64 %index.next, 16384 + br i1 %loop, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX512BW-LABEL: test134: +; AVX512BW: vpmaxub +} + +define void @test135(i8* nocapture %a, i8* nocapture %b) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.a = getelementptr inbounds i8* %a, i64 %index + %gep.b = getelementptr inbounds i8* %b, i64 %index + %ptr.a = bitcast i8* %gep.a to <64 x i8>* + %ptr.b = bitcast i8* %gep.b to <64 x i8>* + %load.a = load <64 x i8>* %ptr.a, align 2 + %load.b = load <64 x i8>* %ptr.b, align 2 + %cmp = icmp ugt <64 x i8> %load.a, %load.b 
+ %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a + store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2 + %index.next = add i64 %index, 32 + %loop = icmp eq i64 %index.next, 16384 + br i1 %loop, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX512BW-LABEL: test135: +; AVX512BW: vpminub +} + +define void @test136(i8* nocapture %a, i8* nocapture %b) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.a = getelementptr inbounds i8* %a, i64 %index + %gep.b = getelementptr inbounds i8* %b, i64 %index + %ptr.a = bitcast i8* %gep.a to <64 x i8>* + %ptr.b = bitcast i8* %gep.b to <64 x i8>* + %load.a = load <64 x i8>* %ptr.a, align 2 + %load.b = load <64 x i8>* %ptr.b, align 2 + %cmp = icmp uge <64 x i8> %load.a, %load.b + %sel = select <64 x i1> %cmp, <64 x i8> %load.b, <64 x i8> %load.a + store <64 x i8> %sel, <64 x i8>* %ptr.a, align 2 + %index.next = add i64 %index, 32 + %loop = icmp eq i64 %index.next, 16384 + br i1 %loop, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX512BW-LABEL: test136: +; AVX512BW: vpminub +} + +define void @test137(i16* nocapture %a, i16* nocapture %b) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.a = getelementptr inbounds i16* %a, i64 %index + %gep.b = getelementptr inbounds i16* %b, i64 %index + %ptr.a = bitcast i16* %gep.a to <32 x i16>* + %ptr.b = bitcast i16* %gep.b to <32 x i16>* + %load.a = load <32 x i16>* %ptr.a, align 2 + %load.b = load <32 x i16>* %ptr.b, align 2 + %cmp = icmp slt <32 x i16> %load.a, %load.b + %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a + store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2 + %index.next = add i64 %index, 16 + %loop = icmp eq i64 %index.next, 16384 + br i1 %loop, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX512BW-LABEL: test137: +; AVX512BW: vpmaxsw +} + +define void @test138(i16* nocapture %a, i16* nocapture %b) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.a = getelementptr inbounds i16* %a, i64 %index + %gep.b = getelementptr inbounds i16* %b, i64 %index + %ptr.a = bitcast i16* %gep.a to <32 x i16>* + %ptr.b = bitcast i16* %gep.b to <32 x i16>* + %load.a = load <32 x i16>* %ptr.a, align 2 + %load.b = load <32 x i16>* %ptr.b, align 2 + %cmp = icmp sle <32 x i16> %load.a, %load.b + %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a + store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2 + %index.next = add i64 %index, 16 + %loop = icmp eq i64 %index.next, 16384 + br i1 %loop, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX512BW-LABEL: test138: +; AVX512BW: vpmaxsw +} + +define void @test139(i16* nocapture %a, i16* nocapture %b) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.a = getelementptr inbounds i16* %a, i64 %index + %gep.b = getelementptr inbounds i16* %b, i64 %index + %ptr.a = bitcast i16* %gep.a to <32 x i16>* + %ptr.b = bitcast i16* %gep.b to <32 x i16>* + %load.a = load <32 x i16>* %ptr.a, align 2 + 
%load.b = load <32 x i16>* %ptr.b, align 2 + %cmp = icmp sgt <32 x i16> %load.a, %load.b + %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a + store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2 + %index.next = add i64 %index, 16 + %loop = icmp eq i64 %index.next, 16384 + br i1 %loop, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX512BW-LABEL: test139: +; AVX512BW: vpminsw +} + +define void @test140(i16* nocapture %a, i16* nocapture %b) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.a = getelementptr inbounds i16* %a, i64 %index + %gep.b = getelementptr inbounds i16* %b, i64 %index + %ptr.a = bitcast i16* %gep.a to <32 x i16>* + %ptr.b = bitcast i16* %gep.b to <32 x i16>* + %load.a = load <32 x i16>* %ptr.a, align 2 + %load.b = load <32 x i16>* %ptr.b, align 2 + %cmp = icmp sge <32 x i16> %load.a, %load.b + %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a + store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2 + %index.next = add i64 %index, 16 + %loop = icmp eq i64 %index.next, 16384 + br i1 %loop, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX512BW-LABEL: test140: +; AVX512BW: vpminsw +} + +define void @test141(i16* nocapture %a, i16* nocapture %b) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.a = getelementptr inbounds i16* %a, i64 %index + %gep.b = getelementptr inbounds i16* %b, i64 %index + %ptr.a = bitcast i16* %gep.a to <32 x i16>* + %ptr.b = bitcast i16* %gep.b to <32 x i16>* + %load.a = load <32 x i16>* %ptr.a, align 2 + %load.b = load <32 x i16>* %ptr.b, align 2 + %cmp = icmp ult <32 x i16> %load.a, %load.b + %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a + store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2 + %index.next = add i64 %index, 16 + %loop = icmp eq i64 %index.next, 16384 + br i1 %loop, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX512BW-LABEL: test141: +; AVX512BW: vpmaxuw +} + +define void @test142(i16* nocapture %a, i16* nocapture %b) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.a = getelementptr inbounds i16* %a, i64 %index + %gep.b = getelementptr inbounds i16* %b, i64 %index + %ptr.a = bitcast i16* %gep.a to <32 x i16>* + %ptr.b = bitcast i16* %gep.b to <32 x i16>* + %load.a = load <32 x i16>* %ptr.a, align 2 + %load.b = load <32 x i16>* %ptr.b, align 2 + %cmp = icmp ule <32 x i16> %load.a, %load.b + %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a + store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2 + %index.next = add i64 %index, 16 + %loop = icmp eq i64 %index.next, 16384 + br i1 %loop, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX512BW-LABEL: test142: +; AVX512BW: vpmaxuw +} + +define void @test143(i16* nocapture %a, i16* nocapture %b) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %gep.a = getelementptr inbounds i16* %a, i64 %index + %gep.b = getelementptr inbounds i16* %b, i64 %index + %ptr.a = bitcast i16* %gep.a to <32 
x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp ugt <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test143:
+; AVX512BW: vpminuw
+}
+
+define void @test144(i16* nocapture %a, i16* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i16* %a, i64 %index
+  %gep.b = getelementptr inbounds i16* %b, i64 %index
+  %ptr.a = bitcast i16* %gep.a to <32 x i16>*
+  %ptr.b = bitcast i16* %gep.b to <32 x i16>*
+  %load.a = load <32 x i16>* %ptr.a, align 2
+  %load.b = load <32 x i16>* %ptr.b, align 2
+  %cmp = icmp uge <32 x i16> %load.a, %load.b
+  %sel = select <32 x i1> %cmp, <32 x i16> %load.b, <32 x i16> %load.a
+  store <32 x i16> %sel, <32 x i16>* %ptr.a, align 2
+  %index.next = add i64 %index, 16
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512BW-LABEL: test144:
+; AVX512BW: vpminuw
+}
+
+define void @test145(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp slt <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test145:
+; AVX512F: vpmaxsd
+}
+
+define void @test146(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp sle <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test146:
+; AVX512F: vpmaxsd
+}
+
+define void @test147(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp sgt <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test147:
+; AVX512F: vpminsd
+}
+
+define void @test148(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp sge <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test148:
+; AVX512F: vpminsd
+}
+
+define void @test149(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp ult <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test149:
+; AVX512F: vpmaxud
+}
+
+define void @test150(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp ule <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test150:
+; AVX512F: vpmaxud
+}
+
+define void @test151(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp ugt <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test151:
+; AVX512F: vpminud
+}
+
+define void @test152(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <16 x i32>*
+  %ptr.b = bitcast i32* %gep.b to <16 x i32>*
+  %load.a = load <16 x i32>* %ptr.a, align 2
+  %load.b = load <16 x i32>* %ptr.b, align 2
+  %cmp = icmp uge <16 x i32> %load.a, %load.b
+  %sel = select <16 x i1> %cmp, <16 x i32> %load.b, <16 x i32> %load.a
+  store <16 x i32> %sel, <16 x i32>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test152:
+; AVX512F: vpminud
+}
+
+; -----------------------
+
+define void @test153(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp slt <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test153:
+; AVX512F: vpmaxsq
+}
+
+define void @test154(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp sle <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test154:
+; AVX512F: vpmaxsq
+}
+
+define void @test155(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp sgt <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test155:
+; AVX512F: vpminsq
+}
+
+define void @test156(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp sge <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test156:
+; AVX512F: vpminsq
+}
+
+define void @test157(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp ult <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test157:
+; AVX512F: vpmaxuq
+}
+
+define void @test158(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp ule <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test158:
+; AVX512F: vpmaxuq
+}
+
+define void @test159(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp ugt <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test159:
+; AVX512F: vpminuq
+}
+
+define void @test160(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <8 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <8 x i64>*
+  %load.a = load <8 x i64>* %ptr.a, align 2
+  %load.b = load <8 x i64>* %ptr.b, align 2
+  %cmp = icmp uge <8 x i64> %load.a, %load.b
+  %sel = select <8 x i1> %cmp, <8 x i64> %load.b, <8 x i64> %load.a
+  store <8 x i64> %sel, <8 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512F-LABEL: test160:
+; AVX512F: vpminuq
+}
+
+define void @test161(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp slt <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test161:
+; AVX512VL: vpminsq
+}
+
+define void @test162(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp sle <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test162:
+; AVX512VL: vpminsq
+}
+
+define void @test163(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp sgt <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test163:
+; AVX512VL: vpmaxsq
+}
+
+define void @test164(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp sge <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test164:
+; AVX512VL: vpmaxsq
+}
+
+define void @test165(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp ult <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test165:
+; AVX512VL: vpminuq
+}
+
+define void @test166(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp ule <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test166:
+; AVX512VL: vpminuq
+}
+
+define void @test167(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp ugt <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test167:
+; AVX512VL: vpmaxuq
+}
+
+define void @test168(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp uge <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.a, <4 x i64> %load.b
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test168:
+; AVX512VL: vpmaxuq
+}
+
+define void @test169(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp slt <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test169:
+; AVX512VL: vpmaxsq
+}
+
+define void @test170(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp sle <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test170:
+; AVX512VL: vpmaxsq
+}
+
+define void @test171(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp sgt <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test171:
+; AVX512VL: vpminsq
+}
+
+define void @test172(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp sge <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test172:
+; AVX512VL: vpminsq
+}
+
+define void @test173(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp ult <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test173:
+; AVX512VL: vpmaxuq
+}
+
+define void @test174(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp ule <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test174:
+; AVX512VL: vpmaxuq
+}
+
+define void @test175(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp ugt <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test175:
+; AVX512VL: vpminuq
+}
+
+define void @test176(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <4 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <4 x i64>*
+  %load.a = load <4 x i64>* %ptr.a, align 2
+  %load.b = load <4 x i64>* %ptr.b, align 2
+  %cmp = icmp uge <4 x i64> %load.a, %load.b
+  %sel = select <4 x i1> %cmp, <4 x i64> %load.b, <4 x i64> %load.a
+  store <4 x i64> %sel, <4 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test176:
+; AVX512VL: vpminuq
+}
+
+define void @test177(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp slt <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test177:
+; AVX512VL: vpminsq
+}
+
+define void @test178(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp sle <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test178:
+; AVX512VL: vpminsq
+}
+
+define void @test179(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp sgt <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test179:
+; AVX512VL: vpmaxsq
+}
+
+define void @test180(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp sge <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test180:
+; AVX512VL: vpmaxsq
+}
+
+define void @test181(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp ult <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test181:
+; AVX512VL: vpminuq
+}
+
+define void @test182(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp ule <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test182:
+; AVX512VL: vpminuq
+}
+
+define void @test183(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp ugt <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test183:
+; AVX512VL: vpmaxuq
+}
+
+define void @test184(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp uge <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.a, <2 x i64> %load.b
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test184:
+; AVX512VL: vpmaxuq
+}
+
+define void @test185(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp slt <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test185:
+; AVX512VL: vpmaxsq
+}
+
+define void @test186(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp sle <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test186:
+; AVX512VL: vpmaxsq
+}
+
+define void @test187(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp sgt <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test187:
+; AVX512VL: vpminsq
+}
+
+define void @test188(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp sge <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test188:
+; AVX512VL: vpminsq
+}
+
+define void @test189(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp ult <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test189:
+; AVX512VL: vpmaxuq
+}
+
+define void @test190(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp ule <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test190:
+; AVX512VL: vpmaxuq
+}
+
+define void @test191(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp ugt <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test191:
+; AVX512VL: vpminuq
+}
+
+define void @test192(i32* nocapture %a, i32* nocapture %b) nounwind {
+vector.ph:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %gep.a = getelementptr inbounds i32* %a, i64 %index
+  %gep.b = getelementptr inbounds i32* %b, i64 %index
+  %ptr.a = bitcast i32* %gep.a to <2 x i64>*
+  %ptr.b = bitcast i32* %gep.b to <2 x i64>*
+  %load.a = load <2 x i64>* %ptr.a, align 2
+  %load.b = load <2 x i64>* %ptr.b, align 2
+  %cmp = icmp uge <2 x i64> %load.a, %load.b
+  %sel = select <2 x i1> %cmp, <2 x i64> %load.b, <2 x i64> %load.a
+  store <2 x i64> %sel, <2 x i64>* %ptr.a, align 2
+  %index.next = add i64 %index, 8
+  %loop = icmp eq i64 %index.next, 16384
+  br i1 %loop, label %for.end, label %vector.body
+
+for.end:                                          ; preds = %vector.body
+  ret void
+
+; AVX512VL-LABEL: test192:
+; AVX512VL: vpminuq
 }
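All of tests 143-192 above exercise one canonical idiom: an icmp feeding a select over the same two operands, which the X86 backend recognizes as a vector min or max and lowers to a single AVX-512 instruction. As a minimal standalone sketch, not part of the patch and with a hypothetical function name, the unsigned-minimum form looks like:

define <16 x i32> @umin_sketch(<16 x i32> %x, <16 x i32> %y) {
  ; unsigned compare plus select of the smaller operand is the
  ; pattern matched to vpminud under AVX512F
  %cmp = icmp ult <16 x i32> %x, %y
  %min = select <16 x i1> %cmp, <16 x i32> %x, <16 x i32> %y
  ret <16 x i32> %min
}

Swapping the select operands (as the tests numbered 169 and up do) yields the complementary max form, which is why the same predicate checks for vpmaxud in those functions.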
diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll
index 42cf06a4a049..3bd1dc4cb972 100644
--- a/test/CodeGen/X86/vselect.ll
+++ b/test/CodeGen/X86/vselect.ll
@@ -3,270 +3,253 @@
 ; Verify that we don't emit packed vector shifts instructions if the
 ; condition used by the vector select is a vector of constants.
-
 define <4 x float> @test1(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b
  ret <4 x float> %1
 }
-; CHECK-LABEL: test1
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
  ret <4 x float> %1
 }
-; CHECK-LABEL: test2
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
  ret <4 x float> %1
 }
-; CHECK-LABEL: test3
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 define <4 x float> @test4(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
  ret <4 x float> %1
 }
-; CHECK-LABEL: test4
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: movaps %xmm1, %xmm0
-; CHECK: ret
-
 define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
  %1 = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
  ret <4 x float> %1
 }
-; CHECK-LABEL: test5
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [0,65535,0,65535,0,65535,0,65535]
+; CHECK-NEXT:    andps %xmm0, %xmm1
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> %a, <8 x i16> %a
  ret <8 x i16> %1
 }
-; CHECK-LABEL: test6
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
  ret <8 x i16> %1
 }
-; CHECK-LABEL: test7
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
-
 define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
  ret <8 x i16> %1
 }
-; CHECK-LABEL: test8
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
  ret <8 x i16> %1
 }
-; CHECK-LABEL: test9
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: movaps %xmm1, %xmm0
-; CHECK-NEXT: ret
 
 define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
  %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
  ret <8 x i16> %1
 }
-; CHECK-LABEL: test10
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm2 = <0,65535,65535,0,u,65535,65535,u>
+; CHECK-NEXT:    andps %xmm2, %xmm0
+; CHECK-NEXT:    andnps %xmm1, %xmm2
+; CHECK-NEXT:    orps %xmm2, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
  ret <8 x i16> %1
 }
-; CHECK-LABEL: test11
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 define <8 x i16> @test12(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <8 x i1> <i1 false, i1 false, i1 undef, i1 false, i1 false, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
  ret <8 x i16> %1
 }
-; CHECK-LABEL: test12
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 define <8 x i16> @test13(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <8 x i1> <i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>, <8 x i16> %a, <8 x i16> %b
  ret <8 x i16> %1
 }
-; CHECK-LABEL: test13
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK: ret
 
 ; Fold (vselect (build_vector AllOnes), N1, N2) -> N1
-
 define <4 x float> @test14(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test14:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
  %1 = select <4 x i1> <i1 true, i1 undef, i1 true, i1 undef>, <4 x float> %a, <4 x float> %b
  ret <4 x float> %1
 }
-; CHECK-LABEL: test14
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: pcmpeq
-; CHECK: ret
 
 define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test15:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
  %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 undef, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
  ret <8 x i16> %1
 }
-; CHECK-LABEL: test15
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: pcmpeq
-; CHECK: ret
 
 ; Fold (vselect (build_vector AllZeros), N1, N2) -> N2
-
 define <4 x float> @test16(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <4 x i1> <i1 false, i1 undef, i1 false, i1 undef>, <4 x float> %a, <4 x float> %b
  ret <4 x float> %1
-}
-; CHECK-LABEL: test16
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: ret
+}
 
 define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test17:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
  ret <8 x i16> %1
 }
-; CHECK-LABEL: test17
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: ret
 
 define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test18:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
  ret <4 x float> %1
 }
-; CHECK-LABEL: test18
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
 
 define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test19:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> %a, <4 x i32> %b
  ret <4 x i32> %1
 }
-; CHECK-LABEL: test19
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
 
 define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test20:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %b
  ret <2 x double> %1
 }
-; CHECK-LABEL: test20
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test21:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <2 x i1> <i1 false, i1 true>, <2 x i64> %a, <2 x i64> %b
  ret <2 x i64> %1
 }
-; CHECK-LABEL: test21
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test22:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
  ret <4 x float> %1
 }
-; CHECK-LABEL: test22
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
 
 define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test23:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movss %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %a, <4 x i32> %b
  ret <4 x i32> %1
 }
-; CHECK-LABEL: test23
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movss
-; CHECK: ret
 
 define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test24:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
  ret <2 x double> %1
 }
-; CHECK-LABEL: test24
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test25:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = select <2 x i1> <i1 true, i1 false>, <2 x i64> %a, <2 x i64> %b
  ret <2 x i64> %1
 }
-; CHECK-LABEL: test25
-; CHECK-NOT: psllw
-; CHECK-NOT: psraw
-; CHECK-NOT: xorps
-; CHECK: movsd
-; CHECK: ret
 
 define <4 x float> @select_of_shuffles_0(<2 x float> %a0, <2 x float> %b0, <2 x float> %a1, <2 x float> %b1) {
-; CHECK-LABEL: select_of_shuffles_0
-; CHECK-DAG: movlhps %xmm2, [[REGA:%xmm[0-9]+]]
-; CHECK-DAG: movlhps %xmm3, [[REGB:%xmm[0-9]+]]
-; CHECK: subps [[REGB]], [[REGA]]
+; CHECK-LABEL: select_of_shuffles_0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; CHECK-NEXT:    subps %xmm1, %xmm0
+; CHECK-NEXT:    retq
  %1 = shufflevector <2 x float> %a0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %2 = shufflevector <2 x float> %a1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  %3 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %2, <4 x float> %1
@@ -276,3 +259,24 @@ define <4 x float> @select_of_shuffles_0(<2 x float> %a0, <2 x float> %b0, <2 x
  %7 = fsub <4 x float> %3, %6
  ret <4 x float> %7
 }
+
+; PR20677
+define <16 x double> @select_illegal(<16 x double> %a, <16 x double> %b) {
+; CHECK-LABEL: select_illegal:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm4
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm5
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm6
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm7
+; CHECK-NEXT:    movaps %xmm7, 112(%rdi)
+; CHECK-NEXT:    movaps %xmm6, 96(%rdi)
+; CHECK-NEXT:    movaps %xmm5, 80(%rdi)
+; CHECK-NEXT:    movaps %xmm4, 64(%rdi)
+; CHECK-NEXT:    movaps %xmm3, 48(%rdi)
+; CHECK-NEXT:    movaps %xmm2, 32(%rdi)
+; CHECK-NEXT:    movaps %xmm1, 16(%rdi)
+; CHECK-NEXT:    movaps %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+  %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
+  ret <16 x double> %sel
+}
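The rewritten vselect.ll checks pin down the lowering rule that the old CHECK-NOT lines only hinted at: a select whose condition is a constant vector never needs the psllw/psraw mask-materialization sequence. Contiguous masks become a single scalar-move style blend (movss/movsd), while non-contiguous masks become an and/andn/or triple. A minimal sketch of the non-contiguous case, with a hypothetical function name not taken from the patch:

define <8 x i16> @constmask_sketch(<8 x i16> %a, <8 x i16> %b) {
  ; An alternating mask cannot be covered by a single element move,
  ; so plain SSE emits (a & M) | (b & ~M), i.e. the andps/andnps/orps
  ; sequence that test11 above checks for.
  %r = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> %a, <8 x i16> %b
  ret <8 x i16> %r
}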
diff --git a/test/CodeGen/X86/vshift-4.ll b/test/CodeGen/X86/vshift-4.ll
index a060cf803727..cda9bc893a2d 100644
--- a/test/CodeGen/X86/vshift-4.ll
+++ b/test/CodeGen/X86/vshift-4.ll
@@ -57,7 +57,7 @@ entry:
 define void @shift3a(<8 x i16> %val, <8 x i16>* %dst, <8 x i16> %amt) nounwind {
 entry:
 ; CHECK-LABEL: shift3a:
-; CHECK: movzwl
+; CHECK: pextrw $6
 ; CHECK: psllw
  %shamt = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
  %shl = shl <8 x i16> %val, %shamt
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index d115929f5aab..e0b861f29de8 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -2,12 +2,12 @@
 ; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
 
 ; CHECK: movl
-; CHECK: paddd
+; CHECK: paddw
 ; CHECK: movlpd
 
 ; Scheduler causes produce a different instruction order
 ; ATOM: movl
-; ATOM: paddd
+; ATOM: paddw
 ; ATOM: movlpd
 
 ; bitcast a v4i16 to v2i32
diff --git a/test/CodeGen/X86/widen_conv-1.ll b/test/CodeGen/X86/widen_conv-1.ll
index 9f6778cff592..3f54ab694c07 100644
--- a/test/CodeGen/X86/widen_conv-1.ll
+++ b/test/CodeGen/X86/widen_conv-1.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
-; CHECK: paddq
+; CHECK: paddd
 
 ; truncate v2i64 to v2i32
diff --git a/test/CodeGen/X86/widen_conversions.ll b/test/CodeGen/X86/widen_conversions.ll
index 522ab475c2a0..8e5174fbe76e 100644
--- a/test/CodeGen/X86/widen_conversions.ll
+++ b/test/CodeGen/X86/widen_conversions.ll
@@ -9,7 +9,7 @@ define <4 x i32> @zext_v4i8_to_v4i32(<4 x i8>* %ptr) {
 ; CHECK: movd (%{{.*}}), %[[X:xmm[0-9]+]]
 ; CHECK-NEXT: pxor %[[Z:xmm[0-9]+]], %[[Z]]
 ; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
-; CHECK-NEXT: punpcklbw %[[Z]], %[[X]]
+; CHECK-NEXT: punpcklwd %[[Z]], %[[X]]
 ; CHECK-NEXT: ret
 
  %val = load <4 x i8>* %ptr
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 41bea859f474..0ec3574d69eb 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -4,12 +4,12 @@
 
 %i32vec3 = type <3 x i32>
-; CHECK: add3i32
 define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
-; CHECK: movdqa
-; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movq
+; CHECK-LABEL: add3i32:
+; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
+; CHECK-NEXT: pextrd $2, %[[R0]], 8(%{{.*}})
+; CHECK-NEXT: movq %[[R0]], (%{{.*}})
  %a = load %i32vec3* %ap, align 16
  %b = load %i32vec3* %bp, align 16
  %x = add %i32vec3 %a, %b
@@ -17,15 +17,15 @@ define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
  ret void
 }
 
-; CHECK: add3i32_2
 define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
-; CHECK: movq
-; CHECK: pinsrd
-; CHECK: movq
-; CHECK: pinsrd
-; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movq
+; CHECK-LABEL: add3i32_2:
+; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R0]]
+; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: pinsrd $2, 8(%{{.*}}), %[[R1]]
+; CHECK-NEXT: paddd %[[R0]], %[[R1]]
+; CHECK-NEXT: pextrd $2, %[[R1]], 8(%{{.*}})
+; CHECK-NEXT: movq %[[R1]], (%{{.*}})
  %a = load %i32vec3* %ap, align 8
  %b = load %i32vec3* %bp, align 8
  %x = add %i32vec3 %a, %b
@@ -34,15 +34,15 @@ define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
 }
 
 %i32vec7 = type <7 x i32>
-; CHECK: add7i32
 define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddd
-; CHECK: paddd
-; CHECK: pextrd
-; CHECK: movq
-; CHECK: movdqa
+; CHECK-LABEL: add7i32:
+; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
+; CHECK-NEXT: paddd 16(%{{.*}}), %[[R1]]
+; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}})
+; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
+; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i32vec7* %ap, align 16
  %b = load %i32vec7* %bp, align 16
  %x = add %i32vec7 %a, %b
@@ -50,18 +50,18 @@ define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
  ret void
 }
 
-; CHECK: add12i32
 %i32vec12 = type <12 x i32>
 define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddd
-; CHECK: paddd
-; CHECK: paddd
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: movdqa
+; CHECK-LABEL: add12i32:
+; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]]
+; CHECK-NEXT: paddd (%{{.*}}), %[[R0]]
+; CHECK-NEXT: paddd 16(%{{.*}}), %[[R1]]
+; CHECK-NEXT: paddd 32(%{{.*}}), %[[R2]]
+; CHECK-NEXT: movdqa %[[R2]], 32(%{{.*}})
+; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}})
+; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i32vec12* %ap, align 16
  %b = load %i32vec12* %bp, align 16
  %x = add %i32vec12 %a, %b
@@ -70,11 +70,17 @@ define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
 }
 
-; CHECK: add3i16
 %i16vec3 = type <3 x i16>
 define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
-; CHECK: paddd
-; CHECK: ret
+; CHECK-LABEL: add3i16:
+; CHECK: pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: paddd %[[R0]], %[[R1]]
+; CHECK-NEXT: movdqa %[[R1]], %[[R0]]
+; CHECK-NEXT: pshufb {{.*}}, %[[R0]]
+; CHECK-NEXT: pmovzxdq %[[R0]], %[[R0]]
+; CHECK-NEXT: pextrw $4, %[[R1]], 4(%{{.*}})
+; CHECK-NEXT: movd %[[R0]], (%{{.*}})
  %a = load %i16vec3* %ap, align 16
  %b = load %i16vec3* %bp, align 16
  %x = add %i16vec3 %a, %b
@@ -82,11 +88,13 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp
  ret void
 }
 
-; CHECK: add4i16
 %i16vec4 = type <4 x i16>
 define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
-; CHECK: paddd
-; CHECK: movq
+; CHECK-LABEL: add4i16:
+; CHECK: movq (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: movq (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: paddw %[[R0]], %[[R1]]
+; CHECK-NEXT: movq %[[R1]], (%{{.*}})
  %a = load %i16vec4* %ap, align 16
  %b = load %i16vec4* %bp, align 16
  %x = add %i16vec4 %a, %b
@@ -94,15 +102,15 @@ define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp
  ret void
 }
 
-; CHECK: add12i16
 %i16vec12 = type <12 x i16>
 define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddw
-; CHECK: paddw
-; CHECK: movq
-; CHECK: movdqa
+; CHECK-LABEL: add12i16:
+; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: paddw (%{{.*}}), %[[R0]]
+; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]]
+; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
+; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i16vec12* %ap, align 16
  %b = load %i16vec12* %bp, align 16
  %x = add %i16vec12 %a, %b
@@ -110,18 +118,18 @@ define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12*
  ret void
 }
 
-; CHECK: add18i16
 %i16vec18 = type <18 x i16>
 define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddw
-; CHECK: paddw
-; CHECK: paddw
-; CHECK: movd
-; CHECK: movdqa
-; CHECK: movdqa
+; CHECK-LABEL: add18i16:
+; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 32(%{{.*}}), %[[R2:xmm[0-9]+]]
+; CHECK-NEXT: paddw (%{{.*}}), %[[R0]]
+; CHECK-NEXT: paddw 16(%{{.*}}), %[[R1]]
+; CHECK-NEXT: paddw 32(%{{.*}}), %[[R2]]
+; CHECK-NEXT: movd %[[R2]], 32(%{{.*}})
+; CHECK-NEXT: movdqa %[[R1]], 16(%{{.*}})
+; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
  %a = load %i16vec18* %ap, align 16
  %b = load %i16vec18* %bp, align 16
  %x = add %i16vec18 %a, %b
@@ -130,11 +138,18 @@
 }
 
-; CHECK: add3i8
 %i8vec3 = type <3 x i8>
 define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
-; CHECK: paddd
-; CHECK: ret
+; CHECK-LABEL: add3i8:
+; CHECK: pmovzxbd (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: pmovzxbd (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: paddd %[[R0]], %[[R1]]
+; CHECK-NEXT: movdqa %[[R1]], %[[R0]]
+; CHECK-NEXT: pshufb {{.*}}, %[[R0]]
+; CHECK-NEXT: pmovzxwq %[[R0]], %[[R0]]
+; CHECK-NEXT: pextrb $8, %[[R1]], 2(%{{.*}})
+; CHECK-NEXT: movd %[[R0]], %e[[R2:[abcd]]]x
+; CHECK-NEXT: movw %[[R2]]x, (%{{.*}})
  %a = load %i8vec3* %ap, align 16
  %b = load %i8vec3* %bp, align 16
  %x = add %i8vec3 %a, %b
@@ -142,17 +157,18 @@ define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) no
  ret void
 }
 
-; CHECK-LABEL: add31i8:
 %i8vec31 = type <31 x i8>
 define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
-; CHECK: movdqa
-; CHECK: movdqa
-; CHECK: paddb
-; CHECK: paddb
-; CHECK: pextrb
-; CHECK: pextrw
-; CHECK: movq
-; CHECK: ret
+; CHECK-LABEL: add31i8:
+; CHECK: movdqa (%{{.*}}), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: movdqa 16(%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: paddb (%{{.*}}), %[[R0]]
+; CHECK-NEXT: paddb 16(%{{.*}}), %[[R1]]
+; CHECK-NEXT: pextrb $14, %[[R1]], 30(%{{.*}})
+; CHECK-NEXT: pextrw $6, %[[R1]], 28(%{{.*}})
+; CHECK-NEXT: pextrd $2, %[[R1]], 24(%{{.*}})
+; CHECK-NEXT: movq %[[R1]], 16(%{{.*}})
+; CHECK-NEXT: movdqa %[[R0]], (%{{.*}})
 %a = load %i8vec31* %ap, align 16
 %b = load %i8vec31* %bp, align 16
 %x = add %i8vec31 %a, %b
@@ -161,14 +177,43 @@
 }
 
-; CHECK: rot
 %i8vec3pack = type { <3 x i8>, i8 }
-define %i8vec3pack @rot() nounwind {
-; CHECK: pmovzxbd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}}
+define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
+; CHECK-LABEL: rot:
+; CHECK: movdqa {{.*}}, %[[CONSTANT0:xmm[0-9]+]]
+; CHECK-NEXT: movdqa {{.*}}, %[[SHUFFLE_MASK:xmm[0-9]+]]
+; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT0]]
+; CHECK-NEXT: pmovzxwq %[[CONSTANT0]], %[[CONSTANT0]]
+; CHECK-NEXT: movd %[[CONSTANT0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: movw %[[R0]]x, (%[[PTR0:.*]])
+; CHECK-NEXT: movb $-98, 2(%[[PTR0]])
+; CHECK-NEXT: movdqa {{.*}}, %[[CONSTANT1:xmm[0-9]+]]
+; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT1]]
+; CHECK-NEXT: pmovzxwq %[[CONSTANT1]], %[[CONSTANT1]]
+; CHECK-NEXT: movd %[[CONSTANT1]], %e[[R1:[abcd]]]x
+; CHECK-NEXT: movw %[[R1]]x, (%[[PTR1:.*]])
+; CHECK-NEXT: movb $1, 2(%[[PTR1]])
+; CHECK-NEXT: pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
+; CHECK-NEXT: pand {{.*}}, %[[X0]]
+; CHECK-NEXT: pextrd $1, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: shrl %e[[R0]]x
+; CHECK-NEXT: movd %[[X0]], %e[[R1:[abcd]]]x
+; CHECK-NEXT: shrl %e[[R1]]x
+; CHECK-NEXT: movd %e[[R1]]x, %[[X1:xmm[0-9]+]]
+; CHECK-NEXT: pinsrd $1, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT: pextrd $2, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: shrl %e[[R0]]x
+; CHECK-NEXT: pinsrd $2, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT: pextrd $3, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: pinsrd $3, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT: movdqa %[[X1]], %[[X2:xmm[0-9]+]]
+; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[X2]]
+; CHECK-NEXT: pmovzxwq %[[X2]], %[[X3:xmm[0-9]+]]
+; CHECK-NEXT: pextrb $8, %[[X1]], 2(%{{.*}})
+; CHECK-NEXT: movd %[[X3]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: movw %[[R0]]x, (%{{.*}})
+
 entry:
-  %X = alloca %i8vec3pack, align 4
-  %rot = alloca %i8vec3pack, align 4
-  %result = alloca %i8vec3pack, align 4
  %storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
  store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
  %storetmp1 = bitcast %i8vec3pack* %rot to <3 x i8>*
@@ -180,7 +225,6 @@ entry:
  %shr = lshr <3 x i8> %extractVec, %extractVec3
  %storetmp4 = bitcast %i8vec3pack* %result to <3 x i8>*
  store <3 x i8> %shr, <3 x i8>* %storetmp4
-  %tmp5 = load %i8vec3pack* %result
-  ret %i8vec3pack %tmp5
+  ret void
 }
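The widen_load-2.ll updates spell out how operations on illegal odd-width vectors are legalized: loads are zero-extended into a wider legal type (pmovzxwd/pmovzxbd), the arithmetic runs at the legal width, and only the original lanes are stored back via pextr/movd stores. A sketch of the same pattern at another odd width, with a hypothetical function name not present in the patch:

define void @add5i32_sketch(<5 x i32>* sret %ret, <5 x i32>* %ap, <5 x i32>* %bp) {
  ; <5 x i32> is not a legal SSE type; codegen widens it to a legal
  ; width (e.g. two xmm halves), performs the add there, then stores
  ; back exactly the five live lanes.
  %a = load <5 x i32>* %ap, align 16
  %b = load <5 x i32>* %bp, align 16
  %x = add <5 x i32> %a, %b
  store <5 x i32> %x, <5 x i32>* %ret, align 16
  ret void
}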
%a = load %i8vec31* %ap, align 16
%b = load %i8vec31* %bp, align 16
%x = add %i8vec31 %a, %b
@@ -161,14 +177,43 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
}

-; CHECK: rot
%i8vec3pack = type { <3 x i8>, i8 }
-define %i8vec3pack @rot() nounwind {
-; CHECK: pmovzxbd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}}
+define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
+; CHECK-LABEL: rot:
+; CHECK: movdqa {{.*}}, %[[CONSTANT0:xmm[0-9]+]]
+; CHECK-NEXT: movdqa {{.*}}, %[[SHUFFLE_MASK:xmm[0-9]+]]
+; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT0]]
+; CHECK-NEXT: pmovzxwq %[[CONSTANT0]], %[[CONSTANT0]]
+; CHECK-NEXT: movd %[[CONSTANT0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: movw %[[R0]]x, (%[[PTR0:.*]])
+; CHECK-NEXT: movb $-98, 2(%[[PTR0]])
+; CHECK-NEXT: movdqa {{.*}}, %[[CONSTANT1:xmm[0-9]+]]
+; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[CONSTANT1]]
+; CHECK-NEXT: pmovzxwq %[[CONSTANT1]], %[[CONSTANT1]]
+; CHECK-NEXT: movd %[[CONSTANT1]], %e[[R1:[abcd]]]x
+; CHECK-NEXT: movw %[[R1]]x, (%[[PTR1:.*]])
+; CHECK-NEXT: movb $1, 2(%[[PTR1]])
+; CHECK-NEXT: pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
+; CHECK-NEXT: pand {{.*}}, %[[X0]]
+; CHECK-NEXT: pextrd $1, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: shrl %e[[R0]]x
+; CHECK-NEXT: movd %[[X0]], %e[[R1:[abcd]]]x
+; CHECK-NEXT: shrl %e[[R1]]x
+; CHECK-NEXT: movd %e[[R1]]x, %[[X1:xmm[0-9]+]]
+; CHECK-NEXT: pinsrd $1, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT: pextrd $2, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: shrl %e[[R0]]x
+; CHECK-NEXT: pinsrd $2, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT: pextrd $3, %[[X0]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: pinsrd $3, %e[[R0]]x, %[[X1]]
+; CHECK-NEXT: movdqa %[[X1]], %[[X2:xmm[0-9]+]]
+; CHECK-NEXT: pshufb %[[SHUFFLE_MASK]], %[[X2]]
+; CHECK-NEXT: pmovzxwq %[[X2]], %[[X3:xmm[0-9]+]]
+; CHECK-NEXT: pextrb $8, %[[X1]], 2(%{{.*}})
+; CHECK-NEXT: movd %[[X3]], %e[[R0:[abcd]]]x
+; CHECK-NEXT: movw %[[R0]]x, (%{{.*}})
+
entry:
- %X = alloca %i8vec3pack, align 4
- %rot = alloca %i8vec3pack, align 4
- %result = alloca %i8vec3pack, align 4
%storetmp = bitcast %i8vec3pack* %X to <3 x i8>*
store <3 x i8> <i8 -98, i8 -98, i8 -98>, <3 x i8>* %storetmp
%storetmp1 = bitcast %i8vec3pack* %rot to <3 x i8>*
@@ -180,7 +225,6 @@ entry:
%shr = lshr <3 x i8> %extractVec, %extractVec3
%storetmp4 = bitcast %i8vec3pack* %result to <3 x i8>*
store <3 x i8> %shr, <3 x i8>* %storetmp4
- %tmp5 = load %i8vec3pack* %result
- ret %i8vec3pack %tmp5
+ ret void
}
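; The widen_load-2.ll checks above all follow one pattern: an odd-sized
; vector such as <3 x i32> is legalized by widening it to <4 x i32>, and the
; store is then split so the inactive fourth lane is never written back,
; hence the movq of the low eight bytes plus a pextrd of element 2. A
; minimal sketch of that pattern (hypothetical @split_store, not one of the
; tests above):
;
;   define void @split_store(<3 x i32>* %p, <3 x i32> %v) {
;     store <3 x i32> %v, <3 x i32>* %p, align 16
;     ret void
;   }
;
; which should lower to a pextrd of lane 2 followed by a movq, exactly as
; checked in add3i32.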
diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index a355b75fafcf..70fdbb7c9c82 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@@ -1,43 +1,56 @@
; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
+target triple = "x86_64-unknown-unknown"
+
; widening shuffle v3float and then an add
define void @shuf(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
-entry:
; CHECK-LABEL: shuf:
-; CHECK: extractps
-; CHECK: extractps
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: addps %xmm1, %xmm0
+; CHECK-NEXT: extractps $2, %xmm0, 8(%eax)
+; CHECK-NEXT: extractps $1, %xmm0, 4(%eax)
+; CHECK-NEXT: movss %xmm0, (%eax)
+; CHECK-NEXT: retl
+entry:
%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 1, i32 2>
%val = fadd <3 x float> %x, %src2
store <3 x float> %val, <3 x float>* %dst.addr
ret void
-; CHECK: ret
}

; widening shuffle v3float with a different mask and then an add
define void @shuf2(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
-entry:
; CHECK-LABEL: shuf2:
-; CHECK: extractps
-; CHECK: extractps
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; CHECK-NEXT: addps %xmm1, %xmm0
+; CHECK-NEXT: extractps $2, %xmm0, 8(%eax)
+; CHECK-NEXT: extractps $1, %xmm0, 4(%eax)
+; CHECK-NEXT: movss %xmm0, (%eax)
+; CHECK-NEXT: retl
+entry:
%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 4, i32 2>
%val = fadd <3 x float> %x, %src2
store <3 x float> %val, <3 x float>* %dst.addr
ret void
-; CHECK: ret
}

; Example of when widening a v3float operation causes the DAG to replace a node
; with the operation that we are currently widening, i.e. when replacing
; opA with opB, the DAG will produce new operations with opA.
define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
-entry:
; CHECK-LABEL: shuf3:
-; CHECK-NOT: movlhps
-; CHECK-NOT: shufps
-; CHECK: pshufd
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; CHECK-NEXT: movaps %xmm1, (%eax)
+; CHECK-NEXT: retl
+entry:
%shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
- %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+ %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
%tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%tmp3.i13 = shufflevector <4 x float> %tmp1.i.i, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> ; <<3 x float>>
%tmp6.i14 = shufflevector <3 x float> %tmp3.i13, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -45,27 +58,35 @@ entry:
%tmp2.i18 = shufflevector <3 x float> %tmp97.i, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
%t5 = bitcast <4 x float> %tmp2.i18 to <4 x i32>
%shr.i.i19 = lshr <4 x i32> %t5, <i32 19, i32 19, i32 19, i32 19>
- %and.i.i20 = and <4 x i32> %shr.i.i19, <i32 4080, i32 4080, i32 4080, i32 4080>
+ %and.i.i20 = and <4 x i32> %shr.i.i19, <i32 4080, i32 4080, i32 4080, i32 4080>
%shuffle.i.i.i21 = shufflevector <4 x float> %tmp2.i18, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
store <4 x float> %shuffle.i.i.i21, <4 x float>* %dst
ret void
-; CHECK: ret
}

; PR10421: make sure we correctly handle extreme widening with CONCAT_VECTORS
define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone {
; CHECK-LABEL: shuf4:
-; CHECK-NOT: punpckldq
+; CHECK: # BB#0:
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT: pshufb %xmm2, %xmm1
+; CHECK-NEXT: pshufb %xmm2, %xmm0
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: retl
%vshuf = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i8> %vshuf
-; CHECK: ret
}

; PR11389: another CONCAT_VECTORS case
define void @shuf5(<8 x i8>* %p) nounwind {
; CHECK-LABEL: shuf5:
+; CHECK: # BB#0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <4,33,u,u,u,u,u,u>
+; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: movlpd %xmm0, (%eax)
+; CHECK-NEXT: retl
%v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
store <8 x i8> %v, <8 x i8>* %p, align 8
ret void
-; CHECK: ret
}
diff --git a/test/CodeGen/X86/win32-pic-jumptable.ll b/test/CodeGen/X86/win32-pic-jumptable.ll
new file mode 100644
index 000000000000..cabd36ae395d
--- /dev/null
+++ b/test/CodeGen/X86/win32-pic-jumptable.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -relocation-model=pic | FileCheck %s
+
+; CHECK: calll L0$pb
+; CHECK-NEXT: L0$pb:
+; CHECK-NEXT: popl %eax
+; CHECK-NEXT: addl LJTI0_0(,%ecx,4), %eax
+; CHECK-NEXT: jmpl *%eax
+
+; CHECK: LJTI0_0:
+; CHECK-NEXT: .long LBB0_4-L0$pb
+; CHECK-NEXT: .long LBB0_5-L0$pb
+; CHECK-NEXT: .long LBB0_6-L0$pb
+; CHECK-NEXT: .long LBB0_7-L0$pb
+
+
+target triple = "i686--windows-itanium"
+define i32 @f(i64 %x) {
+bb0:
+ switch i64 %x, label %bb5 [
+ i64 1, label %bb1
+ i64 2, label %bb2
+ i64 3, label %bb3
+ i64 4, label %bb4
+ ]
+bb1:
+ br label %bb5
+bb2:
+ br label %bb5
+bb3:
+ br label %bb5
+bb4:
+ br label %bb5
+bb5:
+ %y = phi i32 [ 0, %bb0 ], [ 1, %bb1 ], [ 2, %bb2 ], [ 3, %bb3 ], [ 4, %bb4 ]
+ ret i32 %y
+}
diff --git a/test/CodeGen/X86/win64_call_epi.ll b/test/CodeGen/X86/win64_call_epi.ll
new file mode 100644
index 000000000000..71c44b085004
--- /dev/null
+++ b/test/CodeGen/X86/win64_call_epi.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=WIN64
+
+declare void @bar()
+declare void @baz()
+declare i32 @personality(...)
+
+; Check for 'nop' between the last call and the epilogue.
+define void @foo1() {
+
+ invoke void @bar()
+ to label %normal
+ unwind label %catch
+
+normal:
+ ret void
+
+catch:
+ %1 = landingpad { i8*, i32 } personality i32 (...)* @personality cleanup
+ resume { i8*, i32 } %1
+}
+; WIN64-LABEL: foo1:
+; WIN64: .seh_proc foo1
+; WIN64: callq bar
+; WIN64: nop
+; WIN64: addq ${{[0-9]+}}, %rsp
+; WIN64: retq
+; Check for 'ud2' after noreturn call
+; WIN64: callq _Unwind_Resume
+; WIN64-NEXT: ud2
+; WIN64: .seh_endproc
+
+
+; Check it still works when blocks are reordered.
+@something = global i32 0
+define void @foo2(i1 zeroext %cond ) {
+ br i1 %cond, label %a, label %b, !prof !0
+a:
+ call void @bar()
+ br label %done
+b:
+ call void @baz()
+ store i32 0, i32* @something
+ br label %done
+done:
+ ret void
+}
+!0 = !{!"branch_weights", i32 100, i32 0}
+; WIN64-LABEL: foo2:
+; WIN64: callq bar
+; WIN64: nop
+; WIN64: addq ${{[0-9]+}}, %rsp
+; WIN64: retq
+
+
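+; The nop matters because the Win64 unwinder locates a call site from the
+; return address minus one; if a call were the last instruction before the
+; epilogue, that lookup could land in the wrong region. A minimal sketch of
+; the shape that needs padding (hypothetical @last_call, assuming a callee
+; declared like @bar above):
+;
+;   define void @last_call() {
+;     call void @bar()
+;     ret void
+;   }
+;
+; where the call abuts the epilogue, so a nop has to separate them, just as
+; foo2 checks.
+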
+; Check nop is not emitted when call is not adjacent to epilogue.
+define i32 @foo3() {
+ call void @bar()
+ ret i32 0
+}
+; WIN64-LABEL: foo3:
+; WIN64: callq bar
+; WIN64: xorl
+; WIN64-NOT: nop
+; WIN64: addq ${{[0-9]+}}, %rsp
+; WIN64: retq
diff --git a/test/CodeGen/X86/win64_eh.ll b/test/CodeGen/X86/win64_eh.ll
index f1f874eb2f5a..4670087b9b4d 100644
--- a/test/CodeGen/X86/win64_eh.ll
+++ b/test/CodeGen/X86/win64_eh.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -mcpu=corei7 -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -O0 -mcpu=corei7 -mtriple=x86_64-pc-windows-itanium | FileCheck %s -check-prefix=WIN64
; RUN: llc < %s -O0 -mcpu=corei7 -mtriple=x86_64-pc-mingw32 | FileCheck %s -check-prefix=WIN64

; Check function without prolog
diff --git a/test/CodeGen/X86/win64_vararg.ll b/test/CodeGen/X86/win64_vararg.ll
index 1a51b2a64a76..8d7f2010a541 100644
--- a/test/CodeGen/X86/win64_vararg.ll
+++ b/test/CodeGen/X86/win64_vararg.ll
@@ -111,3 +111,22 @@ entry:
%tmp = va_arg i8** %ap, i32
ret i32 %tmp
}
+
+define void @sret_arg(i32* sret %agg.result, i8* nocapture readnone %format, ...) {
+entry:
+ %ap = alloca i8*
+ %ap_i8 = bitcast i8** %ap to i8*
+ call void @llvm.va_start(i8* %ap_i8)
+ %tmp = va_arg i8** %ap, i32
+ store i32 %tmp, i32* %agg.result
+ ret void
+}
+; CHECK-LABEL: sret_arg:
+; CHECK: pushq
+; CHECK-DAG: movq %r9, 40(%rsp)
+; CHECK-DAG: movq %r8, 32(%rsp)
+; CHECK: movl 32(%rsp), %[[tmp:[^ ]*]]
+; CHECK: movl %[[tmp]], (%[[sret:[^ ]*]])
+; CHECK: movq %[[sret]], %rax
+; CHECK: popq
+; CHECK: retq
diff --git a/test/CodeGen/X86/windows-itanium-alloca.ll b/test/CodeGen/X86/windows-itanium-alloca.ll
new file mode 100644
index 000000000000..0a06cdef8793
--- /dev/null
+++ b/test/CodeGen/X86/windows-itanium-alloca.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple i686-windows-itanium -filetype asm -o - %s | FileCheck %s
+
+target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32"
+target triple = "i686--windows-itanium"
+
+declare void @external(i8*)
+
+define dllexport void @alloca(i32 %sz) {
+entry:
+ %vla = alloca i8, i32 %sz, align 1
+ call void @external(i8* %vla)
+ ret void
+}
+
+; CHECK: __chkstk
+
diff --git a/test/CodeGen/X86/x32-function_pointer-1.ll b/test/CodeGen/X86/x32-function_pointer-1.ll
new file mode 100644
index 000000000000..2baf92a99790
--- /dev/null
+++ b/test/CodeGen/X86/x32-function_pointer-1.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s
+
+; Test for x32 function pointer tail call
+
+@foo1 = external global void (i8*)*
+@foo2 = external global void (i8*)*
+
+define void @bar(i8* %h) nounwind uwtable {
+entry:
+ %0 = load void (i8*)** @foo1, align 4
+; CHECK: movl foo1(%rip), %e{{[^,]*}}
+ tail call void %0(i8* %h) nounwind
+; CHECK: callq *%r{{[^,]*}}
+ %1 = load void (i8*)** @foo2, align 4
+; CHECK: movl foo2(%rip), %e{{[^,]*}}
+ tail call void %1(i8* %h) nounwind
+; CHECK: jmpq *%r{{[^,]*}}
+ ret void
+}
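; For these x32 tests, the common point is that pointers are 32 bits wide,
; so the loads above use movl, while indirect call and jump instructions
; still take a full 64-bit register; the loaded value sits in the low half
; of an implicitly zero-extended register, hence callq/jmpq through a
; 64-bit register. A reduced sketch of the same pattern (hypothetical
; @one_call, with an assumed external @fptr):
;
;   @fptr = external global void ()*
;
;   define void @one_call() nounwind {
;   entry:
;     %f = load void ()** @fptr, align 4
;     tail call void %f() nounwind
;     ret void
;   }
;
; which should produce a movl of fptr followed by a jmpq through the
; corresponding 64-bit register.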
diff --git a/test/CodeGen/X86/x32-function_pointer-2.ll b/test/CodeGen/X86/x32-function_pointer-2.ll
new file mode 100644
index 000000000000..f727d41be3a3
--- /dev/null
+++ b/test/CodeGen/X86/x32-function_pointer-2.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s
+
+; Test call function pointer with function argument
+;
+; void bar (void * h, void (*foo) (void *))
+; {
+; foo (h);
+; foo (h);
+; }
+
+
+define void @bar(i8* %h, void (i8*)* nocapture %foo) nounwind {
+entry:
+ tail call void %foo(i8* %h) nounwind
+; CHECK: mov{{l|q}} %{{e|r}}si, %{{e|r}}[[REG:.*]]{{d?}}
+; CHECK: callq *%r[[REG]]
+ tail call void %foo(i8* %h) nounwind
+; CHECK: jmpq *%r{{[^,]*}}
+ ret void
+}
diff --git a/test/CodeGen/X86/x32-function_pointer-3.ll b/test/CodeGen/X86/x32-function_pointer-3.ll
new file mode 100644
index 000000000000..5eaf85d8f931
--- /dev/null
+++ b/test/CodeGen/X86/x32-function_pointer-3.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -fast-isel | FileCheck %s
+
+; Test calling function pointer passed in struct
+
+; The function argument `h' in
+
+; struct foo {
+; void (*f) (void);
+; int i;
+; };
+; void
+; bar (struct foo h)
+; {
+; h.f ();
+; }
+
+; is passed in the 64-bit %rdi register. The `f' field is in the lower 32
+; bits of %rdi register and the `i' field is in the upper 32 bits of %rdi
+; register. We need to zero-extend %edi to %rdi before branching via %rdi.
+
+define void @bar(i64 %h.coerce) nounwind {
+entry:
+ %h.sroa.0.0.extract.trunc = trunc i64 %h.coerce to i32
+ %0 = inttoptr i32 %h.sroa.0.0.extract.trunc to void ()*
+; CHECK: movl %edi, %e[[REG:.*]]
+ tail call void %0() nounwind
+; CHECK: jmpq *%r[[REG]]
+ ret void
+}
diff --git a/test/CodeGen/X86/x86-64-call.ll b/test/CodeGen/X86/x86-64-call.ll
new file mode 100644
index 000000000000..300f8d1025e5
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-call.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-linux -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-pc-linux-gnux32 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686-pc-linux -verify-machineinstrs | FileCheck %s -check-prefix=IA32
+
+; trivial test for correct call suffix
+
+define i32 @far() nounwind uwtable {
+entry:
+; CHECK: callq foo
+; IA32: calll foo
+ tail call void @foo() nounwind
+ ret i32 0
+}
+
+declare void @foo()
diff --git a/test/CodeGen/X86/x86-64-pic-10.ll b/test/CodeGen/X86/x86-64-pic-10.ll
index da8082b92518..8790fa6072b3 100644
--- a/test/CodeGen/X86/x86-64-pic-10.ll
+++ b/test/CodeGen/X86/x86-64-pic-10.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic -o %t1
; RUN: grep "callq g@PLT" %t1

-@g = alias weak i32 ()* @f
+@g = weak alias i32 ()* @f

define void @h() {
entry:
diff --git a/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
new file mode 100644
index 000000000000..c476ffd84053
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-stack-and-frame-ptr.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s
+; RUN: llc -mtriple=x86_64-pc-nacl < %s | FileCheck -check-prefix=NACL %s
+
+; x32 uses %esp, %ebp as stack and frame pointers
+
+; CHECK-LABEL: foo
+; CHECK: pushq %rbp
+; CHECK: movq %rsp, %rbp
+; CHECK: movq %rdi, -8(%rbp)
+; CHECK: popq %rbp
+; X32ABI-LABEL: foo
+; X32ABI: pushq %rbp
+; X32ABI: movl %esp, %ebp
+; X32ABI: movl %edi, -4(%ebp)
+; X32ABI: popq %rbp
+; NACL-LABEL: foo
+; NACL: pushq %rbp
+; NACL: movq %rsp, %rbp
+; NACL: movl %edi, -4(%rbp)
+; NACL: popq %rbp
+
+
+define void @foo(i32* %a) #0 {
+entry:
+ %a.addr = alloca i32*, align 4
+ %b = alloca i32*, align 4
+ store i32* %a, i32** %a.addr, align 4
+ ret void
+}
+
+attributes #0 = { nounwind uwtable "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"}
+
+
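; Background for the TLS test that follows: for a thread-local variable
; defined in the same module, the local-exec model computes the address as
; the thread pointer, %fs:0 on x86-64, plus a link-time constant offset,
; written @TPOFF in assembly. A minimal sketch (hypothetical @tls_counter):
;
;   @tls_counter = internal thread_local global i32 0
;
;   define i32 @read_counter() nounwind {
;     %v = load i32* @tls_counter, align 4
;     ret i32 %v
;   }
;
; which should lower to a load of %fs:tls_counter@TPOFF.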
diff --git a/test/CodeGen/X86/x86-64-tls-1.ll b/test/CodeGen/X86/x86-64-tls-1.ll
index 641786f5a914..2879fb4e1e74 100644
--- a/test/CodeGen/X86/x86-64-tls-1.ll
+++ b/test/CodeGen/X86/x86-64-tls-1.ll
@@ -1,10 +1,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
@tm_nest_level = internal thread_local global i32 0
define i64 @z() nounwind {
-; FIXME: The codegen here is primitive at best and could be much better.
-; The add and the moves can be folded together.
-; CHECK-DAG: movq $tm_nest_level@TPOFF, %rcx
-; CHECK-DAG: movq %fs:0, %rax
-; CHECK: addl %ecx, %eax
+; CHECK: movq $tm_nest_level@TPOFF, %r[[R0:[abcd]]]x
+; CHECK-NEXT: addl %fs:0, %e[[R0]]x
+; CHECK-NEXT: andq $100, %r[[R0]]x
+
ret i64 and (i64 ptrtoint (i32* @tm_nest_level to i64), i64 100)
}
diff --git a/test/CodeGen/X86/x86-inline-asm-validation.ll b/test/CodeGen/X86/x86-inline-asm-validation.ll
new file mode 100644
index 000000000000..56bdc48b0e4c
--- /dev/null
+++ b/test/CodeGen/X86/x86-inline-asm-validation.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple i686-gnu -filetype asm -o - %s 2>&1 | FileCheck %s
+
+define void @test_L_ff() {
+entry:
+ call void asm "", "L,~{dirflag},~{fpsr},~{flags}"(i32 255)
+ ret void
+}
+
+; CHECK-NOT: error: invalid operand for inline asm constraint 'L'
+
+define void @test_L_ffff() {
+entry:
+ call void asm "", "L,~{dirflag},~{fpsr},~{flags}"(i32 65535)
+ ret void
+}
+
+; CHECK-NOT: error: invalid operand for inline asm constraint 'L'
+
+define void @test_M_1() {
+entry:
+ call void asm "", "M,~{dirflag},~{fpsr},~{flags}"(i32 1)
+ ret void
+}
+
+; CHECK-NOT: error: invalid operand for inline asm constraint 'M'
+
+define void @test_O_64() {
+entry:
+ call void asm "", "O,~{dirflag},~{fpsr},~{flags}"(i32 64)
+ ret void
+}
+
+; CHECK-NOT: error: invalid operand for inline asm constraint 'O'
+
diff --git a/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll b/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll
new file mode 100644
index 000000000000..fcf7eaec0544
--- /dev/null
+++ b/test/CodeGen/X86/x86-mixed-alignment-dagcombine.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 -mattr=+64bit,+sse2 < %s | FileCheck %s
+
+; DAGCombine may choose to rewrite 2 loads feeding a select as a select of
+; addresses feeding a load. This test ensures that when it does that it creates
+; a load with alignment equivalent to the most restrictive source load.
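+;
+; As a sketch of the combine (not IR that this test runs): with %p aligned
+; to 16 bytes and %q only to 8,
+;
+;   %v1 = load <2 x double>* %p, align 16
+;   %v2 = load <2 x double>* %q, align 8
+;   %r = select i1 %c, <2 x double> %v1, <2 x double> %v2
+;
+; may be rewritten to select the address first and load once:
+;
+;   %a = select i1 %c, <2 x double>* %p, <2 x double>* %q
+;   %r = load <2 x double>* %a, align 8
+;
+; where the merged load must carry the smaller alignment, so test1 below
+; expects the unaligned movups, while test2, whose sources are both 16-byte
+; aligned, may use movaps.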
+
+declare void @sink(<2 x double>)
+
+define void @test1(i1 %cmp) align 2 {
+ %1 = alloca <2 x double>, align 16
+ %2 = alloca <2 x double>, align 8
+
+ %val = load <2 x double>* %1, align 16
+ %val2 = load <2 x double>* %2, align 8
+ %val3 = select i1 %cmp, <2 x double> %val, <2 x double> %val2
+ call void @sink(<2 x double> %val3)
+ ret void
+ ; CHECK: test1
+ ; CHECK: movups
+ ; CHECK: ret
+}
+
+define void @test2(i1 %cmp) align 2 {
+ %1 = alloca <2 x double>, align 16
+ %2 = alloca <2 x double>, align 8
+
+ %val = load <2 x double>* %1, align 16
+ %val2 = load <2 x double>* %2, align 16
+ %val3 = select i1 %cmp, <2 x double> %val, <2 x double> %val2
+ call void @sink(<2 x double> %val3)
+ ret void
+ ; CHECK: test2
+ ; CHECK: movaps
+ ; CHECK: ret
+}
diff --git a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
index f737519bd153..4317d8ab6a26 100644
--- a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
+++ b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
@@ -2,10 +2,10 @@
define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
; CHECK-LABEL: LCPI0_0:
-; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
-; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
-; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
-; CHECK-NEXT: .long 1065353216 ## float 1.000000e+00
+; CHECK-NEXT: .long 1065353216 ## 0x3f800000
+; CHECK-NEXT: .long 1065353216 ## 0x3f800000
+; CHECK-NEXT: .long 1065353216 ## 0x3f800000
+; CHECK-NEXT: .long 1065353216 ## 0x3f800000
; CHECK-LABEL: foo:
; CHECK: cmpeqps %xmm1, %xmm0
; CHECK-NEXT: andps LCPI0_0(%rip), %xmm0
@@ -54,3 +54,21 @@ define void @foo2(<4 x float>* noalias %result) nounwind {
store <4 x float> %val, <4 x float>* %result
ret void
}
+
+; Fold explicit AND operations when the constant isn't a splat of a single
+; scalar value like what the zext creates.
+define <4 x float> @foo3(<4 x float> %val, <4 x float> %test) nounwind {
+; CHECK-LABEL: LCPI3_0:
+; CHECK-NEXT: .long 1065353216 ## 0x3f800000
+; CHECK-NEXT: .long 0 ## 0x0
+; CHECK-NEXT: .long 1065353216 ## 0x3f800000
+; CHECK-NEXT: .long 0 ## 0x0
+; CHECK-LABEL: foo3:
+; CHECK: cmpeqps %xmm1, %xmm0
+; CHECK-NEXT: andps LCPI3_0(%rip), %xmm0
+ %cmp = fcmp oeq <4 x float> %val, %test
+ %ext = zext <4 x i1> %cmp to <4 x i32>
+ %and = and <4 x i32> %ext, <i32 255, i32 256, i32 257, i32 258>
+ %result = sitofp <4 x i32> %and to <4 x float>
+ ret <4 x float> %result
+}
diff --git a/test/CodeGen/X86/xaluo.ll b/test/CodeGen/X86/xaluo.ll
index f078631c2b33..668628c69ede 100644
--- a/test/CodeGen/X86/xaluo.ll
+++ b/test/CodeGen/X86/xaluo.ll
@@ -1,7 +1,5 @@
-; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefix=DAG
-; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST
-; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
+; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
;
; Get the actual value of the overflow bit.
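; Each with.overflow intrinsic returns a {result, overflow} pair whose
; fields are read with extractvalue, as in this minimal sketch (hypothetical
; @usage, relying on the i32 declaration at the end of the file):
;
;   define i32 @usage(i32 %a, i32 %b, i1* %ov) {
;     %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
;     %v = extractvalue {i32, i1} %t, 0
;     %o = extractvalue {i32, i1} %t, 1
;     store i1 %o, i1* %ov
;     ret i32 %v
;   }
;
; One flag detail drives the inc tests below: x86 inc updates OF but leaves
; CF untouched, so an add of 1 may fold to inc only for the signed forms,
; whose overflow bit is read via seto/jo; the unsigned forms read CF via
; setb/jb and must keep an explicit add.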
@@ -9,12 +7,9 @@ ; SADDO reg, reg define zeroext i1 @saddo.i8(i8 signext %v1, i8 signext %v2, i8* %res) { entry: -; DAG-LABEL: saddo.i8 -; DAG: addb %sil, %dil -; DAG-NEXT: seto %al -; FAST-LABEL: saddo.i8 -; FAST: addb %sil, %dil -; FAST-NEXT: seto %al +; CHECK-LABEL: saddo.i8 +; CHECK: addb %sil, %dil +; CHECK-NEXT: seto %al %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 %v2) %val = extractvalue {i8, i1} %t, 0 %obit = extractvalue {i8, i1} %t, 1 @@ -24,12 +19,9 @@ entry: define zeroext i1 @saddo.i16(i16 %v1, i16 %v2, i16* %res) { entry: -; DAG-LABEL: saddo.i16 -; DAG: addw %si, %di -; DAG-NEXT: seto %al -; FAST-LABEL: saddo.i16 -; FAST: addw %si, %di -; FAST-NEXT: seto %al +; CHECK-LABEL: saddo.i16 +; CHECK: addw %si, %di +; CHECK-NEXT: seto %al %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 %v2) %val = extractvalue {i16, i1} %t, 0 %obit = extractvalue {i16, i1} %t, 1 @@ -39,12 +31,9 @@ entry: define zeroext i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) { entry: -; DAG-LABEL: saddo.i32 -; DAG: addl %esi, %edi -; DAG-NEXT: seto %al -; FAST-LABEL: saddo.i32 -; FAST: addl %esi, %edi -; FAST-NEXT: seto %al +; CHECK-LABEL: saddo.i32 +; CHECK: addl %esi, %edi +; CHECK-NEXT: seto %al %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 %obit = extractvalue {i32, i1} %t, 1 @@ -54,12 +43,9 @@ entry: define zeroext i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) { entry: -; DAG-LABEL: saddo.i64 -; DAG: addq %rsi, %rdi -; DAG-NEXT: seto %al -; FAST-LABEL: saddo.i64 -; FAST: addq %rsi, %rdi -; FAST-NEXT: seto %al +; CHECK-LABEL: saddo.i64 +; CHECK: addq %rsi, %rdi +; CHECK-NEXT: seto %al %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -67,16 +53,48 @@ entry: ret i1 %obit } -; SADDO reg, imm | imm, reg -; FIXME: INC isn't supported in FastISel yet -define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) { +; SADDO reg, 1 | INC +define zeroext i1 @saddo.inc.i8(i8 %v1, i8* %res) { +entry: +; CHECK-LABEL: saddo.inc.i8 +; CHECK: incb %dil +; CHECK-NEXT: seto %al + %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 1) + %val = extractvalue {i8, i1} %t, 0 + %obit = extractvalue {i8, i1} %t, 1 + store i8 %val, i8* %res + ret i1 %obit +} + +define zeroext i1 @saddo.inc.i16(i16 %v1, i16* %res) { +entry: +; CHECK-LABEL: saddo.inc.i16 +; CHECK: incw %di +; CHECK-NEXT: seto %al + %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 1) + %val = extractvalue {i16, i1} %t, 0 + %obit = extractvalue {i16, i1} %t, 1 + store i16 %val, i16* %res + ret i1 %obit +} + +define zeroext i1 @saddo.inc.i32(i32 %v1, i32* %res) { entry: -; DAG-LABEL: saddo.i64imm1 -; DAG: incq %rdi -; DAG-NEXT: seto %al -; FAST-LABEL: saddo.i64imm1 -; FAST: addq $1, %rdi -; FAST-NEXT: seto %al +; CHECK-LABEL: saddo.inc.i32 +; CHECK: incl %edi +; CHECK-NEXT: seto %al + %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 1) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + store i32 %val, i32* %res + ret i1 %obit +} + +define zeroext i1 @saddo.inc.i64(i64 %v1, i64* %res) { +entry: +; CHECK-LABEL: saddo.inc.i64 +; CHECK: incq %rdi +; CHECK-NEXT: seto %al %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 1) %val = extractvalue {i64, i1} %t, 0 %obit = extractvalue {i64, i1} %t, 1 @@ -84,17 +102,18 @@ entry: ret i1 %obit } +; SADDO reg, imm | imm, reg ; FIXME: DAG doesn't optimize immediates on the LHS. 
-define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) {
entry:
-; DAG-LABEL: saddo.i64imm2
-; DAG: mov
-; DAG-NEXT: addq
-; DAG-NEXT: seto
-; FAST-LABEL: saddo.i64imm2
-; FAST: addq $1, %rdi
-; FAST-NEXT: seto %al
- %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 1, i64 %v1)
+; SDAG-LABEL: saddo.i64imm1
+; SDAG: mov
+; SDAG-NEXT: addq
+; SDAG-NEXT: seto
+; FAST-LABEL: saddo.i64imm1
+; FAST: addq $2, %rdi
+; FAST-NEXT: seto %al
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 2, i64 %v1)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
store i64 %val, i64* %res
@@ -102,14 +121,11 @@ entry:
}

; Check boundary conditions for large immediates.
-define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) {
entry:
-; DAG-LABEL: saddo.i64imm3
-; DAG: addq $-2147483648, %rdi
-; DAG-NEXT: seto %al
-; FAST-LABEL: saddo.i64imm3
-; FAST: addq $-2147483648, %rdi
-; FAST-NEXT: seto %al
+; CHECK-LABEL: saddo.i64imm2
+; CHECK: addq $-2147483648, %rdi
+; CHECK-NEXT: seto %al
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -2147483648)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -117,16 +133,12 @@ entry:
ret i1 %obit
}

-define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) {
entry:
-; DAG-LABEL: saddo.i64imm4
-; DAG: movabsq $-21474836489, %[[REG:[a-z]+]]
-; DAG-NEXT: addq %rdi, %[[REG]]
-; DAG-NEXT: seto
-; FAST-LABEL: saddo.i64imm4
-; FAST: movabsq $-21474836489, %[[REG:[a-z]+]]
-; FAST-NEXT: addq %rdi, %[[REG]]
-; FAST-NEXT: seto
+; CHECK-LABEL: saddo.i64imm3
+; CHECK: movabsq $-21474836489, %[[REG:[a-z]+]]
+; CHECK-NEXT: addq %rdi, %[[REG]]
+; CHECK-NEXT: seto
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -21474836489)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -134,14 +146,11 @@ entry:
ret i1 %obit
}

-define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) {
entry:
-; DAG-LABEL: saddo.i64imm5
-; DAG: addq $2147483647, %rdi
-; DAG-NEXT: seto
-; FAST-LABEL: saddo.i64imm5
-; FAST: addq $2147483647, %rdi
-; FAST-NEXT: seto
+; CHECK-LABEL: saddo.i64imm4
+; CHECK: addq $2147483647, %rdi
+; CHECK-NEXT: seto
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483647)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -149,17 +158,12 @@ entry:
ret i1 %obit
}

-; TODO: FastISel shouldn't use movabsq.
-define zeroext i1 @saddo.i64imm6(i64 %v1, i64* %res) {
+define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) {
entry:
-; DAG-LABEL: saddo.i64imm6
-; DAG: movl $2147483648, %ecx
-; DAG: addq %rdi, %rcx
-; DAG-NEXT: seto
-; FAST-LABEL: saddo.i64imm6
-; FAST: movabsq $2147483648, %[[REG:[a-z]+]]
-; FAST: addq %rdi, %[[REG]]
-; FAST-NEXT: seto
+; CHECK-LABEL: saddo.i64imm5
+; CHECK: movl $2147483648
+; CHECK: addq %rdi
+; CHECK-NEXT: seto
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483648)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -170,12 +174,9 @@ entry:
; UADDO
define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
-; DAG-LABEL: uaddo.i32
-; DAG: addl %esi, %edi
-; DAG-NEXT: setb %al
-; FAST-LABEL: uaddo.i32
-; FAST: addl %esi, %edi
-; FAST-NEXT: setb %al
+; CHECK-LABEL: uaddo.i32
+; CHECK: addl %esi, %edi
+; CHECK-NEXT: setb %al
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -185,12 +186,9 @@ entry:
define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
-; DAG-LABEL: uaddo.i64
-; DAG: addq %rsi, %rdi
-; DAG-NEXT: setb %al
-; FAST-LABEL: uaddo.i64
-; FAST: addq %rsi, %rdi
-; FAST-NEXT: setb %al
+; CHECK-LABEL: uaddo.i64
+; CHECK: addq %rsi, %rdi
+; CHECK-NEXT: setb %al
%t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -198,15 +196,57 @@ entry:
ret i1 %obit
}

+; UADDO reg, 1 | NOT INC
+define zeroext i1 @uaddo.inc.i8(i8 %v1, i8* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i8
+; CHECK-NOT: incb %dil
+ %t = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %v1, i8 1)
+ %val = extractvalue {i8, i1} %t, 0
+ %obit = extractvalue {i8, i1} %t, 1
+ store i8 %val, i8* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @uaddo.inc.i16(i16 %v1, i16* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i16
+; CHECK-NOT: incw %di
+ %t = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %v1, i16 1)
+ %val = extractvalue {i16, i1} %t, 0
+ %obit = extractvalue {i16, i1} %t, 1
+ store i16 %val, i16* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @uaddo.inc.i32(i32 %v1, i32* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i32
+; CHECK-NOT: incl %edi
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 1)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, i32* %res
+ ret i1 %obit
+}
+
+define zeroext i1 @uaddo.inc.i64(i64 %v1, i64* %res) {
+entry:
+; CHECK-LABEL: uaddo.inc.i64
+; CHECK-NOT: incq %rdi
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 1)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, i64* %res
+ ret i1 %obit
+}
+
; SSUBO
define zeroext i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
-; DAG-LABEL: ssubo.i32
-; DAG: subl %esi, %edi
-; DAG-NEXT: seto %al
-; FAST-LABEL: ssubo.i32
-; FAST: subl %esi, %edi
-; FAST-NEXT: seto %al
+; CHECK-LABEL: ssubo.i32
+; CHECK: subl %esi, %edi
+; CHECK-NEXT: seto %al
%t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -216,12 +256,9 @@ entry:
define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
-; DAG-LABEL: ssubo.i64
-; DAG: subq %rsi, %rdi
-; DAG-NEXT: seto %al
-; FAST-LABEL: ssubo.i64
-; FAST: subq %rsi, %rdi
-; FAST-NEXT: seto %al
+; CHECK-LABEL: ssubo.i64
+; CHECK: subq %rsi, %rdi
+; CHECK-NEXT: seto %al
%t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -232,12 +269,9 @@ entry:
; USUBO
define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
-; DAG-LABEL: usubo.i32
-; DAG: subl %esi, %edi
-; DAG-NEXT: setb %al
-; FAST-LABEL: usubo.i32
-; FAST: subl %esi, %edi
-; FAST-NEXT: setb %al
+; CHECK-LABEL: usubo.i32
+; CHECK: subl %esi, %edi
+; CHECK-NEXT: setb %al
%t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -247,12 +281,9 @@ entry:
define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
-; DAG-LABEL: usubo.i64
-; DAG: subq %rsi, %rdi
-; DAG-NEXT: setb %al
-; FAST-LABEL: usubo.i64
-; FAST: subq %rsi, %rdi
-; FAST-NEXT: setb %al
+; CHECK-LABEL: usubo.i64
+; CHECK: subq %rsi, %rdi
+; CHECK-NEXT: setb %al
%t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -263,10 +294,10 @@ entry:
; SMULO
define zeroext i1 @smulo.i8(i8 %v1, i8 %v2, i8* %res) {
entry:
-; FAST-LABEL: smulo.i8
-; FAST: movb %dil, %al
-; FAST-NEXT: imulb %sil
-; FAST-NEXT: seto %cl
+; CHECK-LABEL: smulo.i8
+; CHECK: movb %dil, %al
+; CHECK-NEXT: imulb %sil
+; CHECK-NEXT: seto %cl
%t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
%val = extractvalue {i8, i1} %t, 0
%obit = extractvalue {i8, i1} %t, 1
@@ -276,12 +307,9 @@ entry:
define zeroext i1 @smulo.i16(i16 %v1, i16 %v2, i16* %res) {
entry:
-; DAG-LABEL: smulo.i16
-; DAG: imulw %si, %di
-; DAG-NEXT: seto %al
-; FAST-LABEL: smulo.i16
-; FAST: imulw %si, %di
-; FAST-NEXT: seto %al
+; CHECK-LABEL: smulo.i16
+; CHECK: imulw %si, %di
+; CHECK-NEXT: seto %al
%t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
%val = extractvalue {i16, i1} %t, 0
%obit = extractvalue {i16, i1} %t, 1
@@ -291,12 +319,9 @@ entry:
define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
-; DAG-LABEL: smulo.i32
-; DAG: imull %esi, %edi
-; DAG-NEXT: seto %al
-; FAST-LABEL: smulo.i32
-; FAST: imull %esi, %edi
-; FAST-NEXT: seto %al
+; CHECK-LABEL: smulo.i32
+; CHECK: imull %esi, %edi
+; CHECK-NEXT: seto %al
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -306,12 +331,9 @@ entry:
define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
-; DAG-LABEL: smulo.i64
-; DAG: imulq %rsi, %rdi
-; DAG-NEXT: seto %al
-; FAST-LABEL: smulo.i64
-; FAST: imulq %rsi, %rdi
-; FAST-NEXT: seto %al
+; CHECK-LABEL: smulo.i64
+; CHECK: imulq %rsi, %rdi
+; CHECK-NEXT: seto %al
%t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -322,10 +344,10 @@ entry:
; UMULO
define zeroext i1 @umulo.i8(i8 %v1, i8 %v2, i8* %res) {
entry:
-; FAST-LABEL: umulo.i8
-; FAST: movb %dil, %al
-; FAST-NEXT: mulb %sil
-; FAST-NEXT: seto %cl
+; CHECK-LABEL: umulo.i8
+; CHECK: movb %dil, %al
+; CHECK-NEXT: mulb %sil
+; CHECK-NEXT: seto %cl
%t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
%val = extractvalue {i8, i1} %t, 0
%obit = extractvalue {i8, i1} %t, 1
@@ -335,12 +357,9 @@ entry:
define zeroext i1 @umulo.i16(i16 %v1, i16 %v2, i16* %res) {
entry:
-; DAG-LABEL: umulo.i16
-; DAG: mulw %si
-; DAG-NEXT: seto
-; FAST-LABEL: umulo.i16
-; FAST: mulw %si
-; FAST-NEXT: seto
+; CHECK-LABEL: umulo.i16
+; CHECK: mulw %si
+; CHECK-NEXT: seto
%t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
%val = extractvalue {i16, i1} %t, 0
%obit = extractvalue {i16, i1} %t, 1
@@ -350,12 +369,9 @@ entry:
define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
entry:
-; DAG-LABEL: umulo.i32
-; DAG: mull %esi
-; DAG-NEXT: seto
-; FAST-LABEL: umulo.i32
-; FAST: mull %esi
-; FAST-NEXT: seto
+; CHECK-LABEL: umulo.i32
+; CHECK: mull %esi
+; CHECK-NEXT: seto
%t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -365,12 +381,9 @@ entry:
define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
entry:
-; DAG-LABEL: umulo.i64
-; DAG: mulq %rsi
-; DAG-NEXT: seto
-; FAST-LABEL: umulo.i64
-; FAST: mulq %rsi
-; FAST-NEXT: seto
+; CHECK-LABEL: umulo.i64
+; CHECK: mulq %rsi
+; CHECK-NEXT: seto
%t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -383,9 +396,9 @@ entry:
;
define i32 @saddo.select.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: saddo.select.i32
-; CHECK: addl %esi, %eax
-; CHECK-NEXT: cmovol %edi, %esi
+; CHECK-LABEL: saddo.select.i32
+; CHECK: addl %esi, %eax
+; CHECK-NEXT: cmovol %edi, %esi
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -394,9 +407,9 @@ entry:
define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: saddo.select.i64
-; CHECK: addq %rsi, %rax
-; CHECK-NEXT: cmovoq %rdi, %rsi
+; CHECK-LABEL: saddo.select.i64
+; CHECK: addq %rsi, %rax
+; CHECK-NEXT: cmovoq %rdi, %rsi
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -405,9 +418,9 @@ entry:
define i32 @uaddo.select.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: uaddo.select.i32
-; CHECK: addl %esi, %eax
-; CHECK-NEXT: cmovbl %edi, %esi
+; CHECK-LABEL: uaddo.select.i32
+; CHECK: addl %esi, %eax
+; CHECK-NEXT: cmovbl %edi, %esi
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -416,9 +429,9 @@ entry:
define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: uaddo.select.i64
-; CHECK: addq %rsi, %rax
-; CHECK-NEXT: cmovbq %rdi, %rsi
+; CHECK-LABEL: uaddo.select.i64
+; CHECK: addq %rsi, %rax
+; CHECK-NEXT: cmovbq %rdi, %rsi
%t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -427,9 +440,9 @@ entry:
define i32 @ssubo.select.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: ssubo.select.i32
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: cmovol %edi, %esi
+; CHECK-LABEL: ssubo.select.i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: cmovol %edi, %esi
%t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -438,9 +451,9 @@ entry:
define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: ssubo.select.i64
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmovoq %rdi, %rsi
+; CHECK-LABEL: ssubo.select.i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovoq %rdi, %rsi
%t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -449,9 +462,9 @@ entry:
-; CHECK-LABEL: usubo.select.i32
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: cmovbl %edi, %esi
+; CHECK-LABEL: usubo.select.i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: cmovbl %edi, %esi
%t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -460,9 +473,9 @@ entry:
define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: usubo.select.i64
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmovbq %rdi, %rsi
+; CHECK-LABEL: usubo.select.i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovbq %rdi, %rsi
%t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -471,9 +484,9 @@ entry:
define i32 @smulo.select.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: smulo.select.i32
-; CHECK: imull %esi, %eax
-; CHECK-NEXT: cmovol %edi, %esi
+; CHECK-LABEL: smulo.select.i32
+; CHECK: imull %esi, %eax
+; CHECK-NEXT: cmovol %edi, %esi
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -482,9 +495,9 @@ entry:
define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: smulo.select.i64
-; CHECK: imulq %rsi, %rax
-; CHECK-NEXT: cmovoq %rdi, %rsi
+; CHECK-LABEL: smulo.select.i64
+; CHECK: imulq %rsi, %rax
+; CHECK-NEXT: cmovoq %rdi, %rsi
%t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -493,9 +506,9 @@ entry:
define i32 @umulo.select.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: umulo.select.i32
-; CHECK: mull %esi
-; CHECK-NEXT: cmovol %edi, %esi
+; CHECK-LABEL: umulo.select.i32
+; CHECK: mull %esi
+; CHECK-NEXT: cmovol %edi, %esi
%t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
%ret = select i1 %obit, i32 %v1, i32 %v2
@@ -504,9 +517,9 @@ entry:
define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: umulo.select.i64
-; CHECK: mulq %rsi
-; CHECK-NEXT: cmovoq %rdi, %rsi
+; CHECK-LABEL: umulo.select.i64
+; CHECK: mulq %rsi
+; CHECK-NEXT: cmovoq %rdi, %rsi
%t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
%obit = extractvalue {i64, i1} %t, 1
%ret = select i1 %obit, i64 %v1, i64 %v2
@@ -519,9 +532,9 @@ entry:
;
define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: saddo.br.i32
-; CHECK: addl %esi, %edi
-; CHECK-NEXT: jo
+; CHECK-LABEL: saddo.br.i32
+; CHECK: addl %esi, %edi
+; CHECK-NEXT: jo
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -536,9 +549,9 @@ continue:
define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: saddo.br.i64
-; CHECK: addq %rsi, %rdi
-; CHECK-NEXT: jo
+; CHECK-LABEL: saddo.br.i64
+; CHECK: addq %rsi, %rdi
+; CHECK-NEXT: jo
%t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -553,9 +566,9 @@ continue:
define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: uaddo.br.i32
-; CHECK: addl %esi, %edi
-; CHECK-NEXT: jb
+; CHECK-LABEL: uaddo.br.i32
+; CHECK: addl %esi, %edi
+; CHECK-NEXT: jb
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -570,9 +583,9 @@ continue:
define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: uaddo.br.i64
-; CHECK: addq %rsi, %rdi
-; CHECK-NEXT: jb
+; CHECK-LABEL: uaddo.br.i64
+; CHECK: addq %rsi, %rdi
+; CHECK-NEXT: jb
%t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -587,9 +600,9 @@ continue:
define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: ssubo.br.i32
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: jo
+; CHECK-LABEL: ssubo.br.i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jo
%t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -604,9 +617,9 @@ continue:
define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: ssubo.br.i64
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: jo
+; CHECK-LABEL: ssubo.br.i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jo
%t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -621,9 +634,9 @@ continue:
define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: usubo.br.i32
-; CHECK: cmpl %esi, %edi
-; CHECK-NEXT: jb
+; CHECK-LABEL: usubo.br.i32
+; CHECK: cmpl %esi, %edi
+; CHECK-NEXT: jb
%t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -638,9 +651,9 @@ continue:
define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: usubo.br.i64
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: jb
+; CHECK-LABEL: usubo.br.i64
+; CHECK: cmpq %rsi, %rdi
+; CHECK-NEXT: jb
%t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -655,9 +668,9 @@ continue:
define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: smulo.br.i32
-; CHECK: imull %esi, %edi
-; CHECK-NEXT: jo
+; CHECK-LABEL: smulo.br.i32
+; CHECK: imull %esi, %edi
+; CHECK-NEXT: jo
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -672,9 +685,9 @@ continue:
define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: smulo.br.i64
-; CHECK: imulq %rsi, %rdi
-; CHECK-NEXT: jo
+; CHECK-LABEL: smulo.br.i64
+; CHECK: imulq %rsi, %rdi
+; CHECK-NEXT: jo
%t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -689,9 +702,9 @@ continue:
define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) {
entry:
-; CHECK-LABEL: umulo.br.i32
-; CHECK: mull %esi
-; CHECK-NEXT: jo
+; CHECK-LABEL: umulo.br.i32
+; CHECK: mull %esi
+; CHECK-NEXT: jo
%t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
%obit = extractvalue {i32, i1} %t, 1
@@ -706,9 +719,9 @@ continue:
define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
entry:
-; CHECK-LABEL: umulo.br.i64
-; CHECK: mulq %rsi
-; CHECK-NEXT: jo
+; CHECK-LABEL: umulo.br.i64
+; CHECK: mulq %rsi
+; CHECK-NEXT: jo
%t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
%val = extractvalue {i64, i1} %t, 0
%obit = extractvalue {i64, i1} %t, 1
@@ -725,6 +738,8 @@ declare {i8, i1} @llvm.sadd.with.overflow.i8 (i8, i8 ) nounwind readnone
declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) nounwind readnone
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i8, i1} @llvm.uadd.with.overflow.i8 (i8, i8 ) nounwind readnone
+declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16) nounwind readnone
declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
@@ -740,4 +755,4 @@ declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16) nounwind readnone
declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone

-!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647}
+!0 = !{!"branch_weights", i32 0, i32 2147483647}
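; A closing note on the .br tests in xaluo.ll: when the overflow bit only
; feeds a conditional branch, no set* instruction is materialized; the flags
; from the add/sub/mul feed jo or jb directly. A minimal sketch
; (hypothetical @br_usage):
;
;   define i32 @br_usage(i32 %a, i32 %b) {
;   entry:
;     %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
;     %obit = extractvalue {i32, i1} %t, 1
;     br i1 %obit, label %overflow, label %continue
;   overflow:
;     ret i32 0
;   continue:
;     %val = extractvalue {i32, i1} %t, 0
;     ret i32 %val
;   }
;
; expected to lower to addl followed by jb, with no intervening setb.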