Diffstat (limited to 'test/CodeGen')
124 files changed, 12311 insertions, 967 deletions
diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll index ff18f7364337..11228c7e8808 100644 --- a/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/test/CodeGen/AArch64/arm64-ccmp.ll @@ -287,3 +287,43 @@ sw.bb.i.i: %code1.i.i.phi.trans.insert = getelementptr inbounds %str1, %str1* %0, i64 0, i32 0, i32 0, i64 16 br label %sw.bb.i.i } + +; CHECK-LABEL: select_and +define i64 @select_and(i32 %v1, i32 %v2, i64 %a, i64 %b) { +; CHECK: cmp +; CHECK: ccmp{{.*}}, #0, ne +; CHECK: csel{{.*}}, lt + %1 = icmp slt i32 %v1, %v2 + %2 = icmp ne i32 5, %v2 + %3 = and i1 %1, %2 + %sel = select i1 %3, i64 %a, i64 %b + ret i64 %sel +} + +; CHECK-LABEL: select_or +define i64 @select_or(i32 %v1, i32 %v2, i64 %a, i64 %b) { +; CHECK: cmp +; CHECK: ccmp{{.*}}, #8, eq +; CHECK: csel{{.*}}, lt + %1 = icmp slt i32 %v1, %v2 + %2 = icmp ne i32 5, %v2 + %3 = or i1 %1, %2 + %sel = select i1 %3, i64 %a, i64 %b + ret i64 %sel +} + +; CHECK-LABEL: select_complicated +define i16 @select_complicated(double %v1, double %v2, i16 %a, i16 %b) { +; CHECK: fcmp +; CHECK: fccmp{{.*}}, #4, ne +; CHECK: fccmp{{.*}}, #1, ne +; CHECK: fccmp{{.*}}, #4, vc +; CHECK: csel{{.*}}, eq + %1 = fcmp one double %v1, %v2 + %2 = fcmp oeq double %v2, 13.0 + %3 = fcmp oeq double %v1, 42.0 + %or0 = or i1 %2, %3 + %or1 = or i1 %1, %or0 + %sel = select i1 %or1, i16 %a, i16 %b + ret i16 %sel +} diff --git a/test/CodeGen/AArch64/arm64-named-reg-alloc.ll b/test/CodeGen/AArch64/arm64-named-reg-alloc.ll index 0c564544a538..5d48c17e1286 100644 --- a/test/CodeGen/AArch64/arm64-named-reg-alloc.ll +++ b/test/CodeGen/AArch64/arm64-named-reg-alloc.ll @@ -4,7 +4,7 @@ define i32 @get_stack() nounwind { entry: ; FIXME: Include an allocatable-specific error message -; CHECK: Invalid register name global variable +; CHECK: Invalid register name "x5". %sp = call i32 @llvm.read_register.i32(metadata !0) ret i32 %sp } diff --git a/test/CodeGen/AArch64/arm64-named-reg-notareg.ll b/test/CodeGen/AArch64/arm64-named-reg-notareg.ll index 759bc15807b5..8a5fd6f1ac8b 100644 --- a/test/CodeGen/AArch64/arm64-named-reg-notareg.ll +++ b/test/CodeGen/AArch64/arm64-named-reg-notareg.ll @@ -3,7 +3,7 @@ define i32 @get_stack() nounwind { entry: -; CHECK: Invalid register name global variable +; CHECK: Invalid register name "notareg". %sp = call i32 @llvm.read_register.i32(metadata !0) ret i32 %sp } diff --git a/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll b/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll new file mode 100644 index 000000000000..e83cbab140a7 --- /dev/null +++ b/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll @@ -0,0 +1,74 @@ +; RUN: llc -mtriple=aarch64-apple-ios -asm-verbose=false -aarch64-collect-loh=false \ +; RUN: -O1 -global-merge-group-by-use -global-merge-ignore-single-use \ +; RUN: %s -o - | FileCheck %s + +; Check that, at -O1, we only merge globals used in minsize functions. +; We assume that globals of the same size aren't reordered inside a set. +; We use -global-merge-ignore-single-use, and thus only expect one merged set. 
+ +@m1 = internal global i32 0, align 4 +@n1 = internal global i32 0, align 4 + +; CHECK-LABEL: f1: +define void @f1(i32 %a1, i32 %a2) minsize nounwind { +; CHECK-NEXT: adrp x8, [[SET:__MergedGlobals]]@PAGE +; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF +; CHECK-NEXT: stp w0, w1, [x8] +; CHECK-NEXT: ret + store i32 %a1, i32* @m1, align 4 + store i32 %a2, i32* @n1, align 4 + ret void +} + +@m2 = internal global i32 0, align 4 +@n2 = internal global i32 0, align 4 + +; CHECK-LABEL: f2: +define void @f2(i32 %a1, i32 %a2) nounwind { +; CHECK-NEXT: adrp x8, _m2@PAGE +; CHECK-NEXT: adrp x9, _n2@PAGE +; CHECK-NEXT: str w0, [x8, _m2@PAGEOFF] +; CHECK-NEXT: str w1, [x9, _n2@PAGEOFF] +; CHECK-NEXT: ret + store i32 %a1, i32* @m2, align 4 + store i32 %a2, i32* @n2, align 4 + ret void +} + +; If we have use sets partially overlapping between a minsize and a non-minsize +; function, explicitly check that we only consider the globals used in the +; minsize function for merging. + +@m3 = internal global i32 0, align 4 +@n3 = internal global i32 0, align 4 + +; CHECK-LABEL: f3: +define void @f3(i32 %a1, i32 %a2) minsize nounwind { +; CHECK-NEXT: adrp x8, [[SET]]@PAGE +; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF +; CHECK-NEXT: stp w0, w1, [x8, #8] +; CHECK-NEXT: ret + store i32 %a1, i32* @m3, align 4 + store i32 %a2, i32* @n3, align 4 + ret void +} + +@n4 = internal global i32 0, align 4 + +; CHECK-LABEL: f4: +define void @f4(i32 %a1, i32 %a2) nounwind { +; CHECK-NEXT: adrp x8, [[SET]]@PAGE +; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF +; CHECK-NEXT: adrp x9, _n4@PAGE +; CHECK-NEXT: str w0, [x8, #8] +; CHECK-NEXT: str w1, [x9, _n4@PAGEOFF] +; CHECK-NEXT: ret + store i32 %a1, i32* @m3, align 4 + store i32 %a2, i32* @n4, align 4 + ret void +} + +; CHECK-DAG: .zerofill __DATA,__bss,[[SET]],16,3 +; CHECK-DAG: .zerofill __DATA,__bss,_m2,4,2 +; CHECK-DAG: .zerofill __DATA,__bss,_n2,4,2 +; CHECK-DAG: .zerofill __DATA,__bss,_n4,4,2 diff --git a/test/CodeGen/AArch64/minmax.ll b/test/CodeGen/AArch64/minmax.ll index a6b5adebe107..df4912ca1f7a 100644 --- a/test/CodeGen/AArch64/minmax.ll +++ b/test/CodeGen/AArch64/minmax.ll @@ -94,3 +94,14 @@ define <16 x i32> @t11(<16 x i32> %a, <16 x i32> %b) { %t2 = select <16 x i1> %t1, <16 x i32> %a, <16 x i32> %b ret <16 x i32> %t2 } + +; CHECK-LABEL: t12 +; CHECK-NOT: umin +; The icmp is used by two instructions, so don't produce a umin node. 
+define <16 x i8> @t12(<16 x i8> %a, <16 x i8> %b) { + %t1 = icmp ugt <16 x i8> %b, %a + %t2 = select <16 x i1> %t1, <16 x i8> %a, <16 x i8> %b + %t3 = zext <16 x i1> %t1 to <16 x i8> + %t4 = add <16 x i8> %t3, %t2 + ret <16 x i8> %t4 +} diff --git a/test/CodeGen/AArch64/special-reg.ll b/test/CodeGen/AArch64/special-reg.ll new file mode 100644 index 000000000000..91c32158d420 --- /dev/null +++ b/test/CodeGen/AArch64/special-reg.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s -mtriple=aarch64-none-eabi -mcpu=cortex-a57 2>&1 | FileCheck %s + +define i64 @read_encoded_register() nounwind { +entry: +; CHECK-LABEL: read_encoded_register: +; CHECK: mrs x0, S1_2_C3_C4_5 + %reg = call i64 @llvm.read_register.i64(metadata !0) + ret i64 %reg +} + +define i64 @read_daif() nounwind { +entry: +; CHECK-LABEL: read_daif: +; CHECK: mrs x0, DAIF + %reg = call i64 @llvm.read_register.i64(metadata !1) + ret i64 %reg +} + +define void @write_encoded_register(i64 %x) nounwind { +entry: +; CHECK-LABEL: write_encoded_register: +; CHECK: msr S1_2_C3_C4_5, x0 + call void @llvm.write_register.i64(metadata !0, i64 %x) + ret void +} + +define void @write_daif(i64 %x) nounwind { +entry: +; CHECK-LABEL: write_daif: +; CHECK: msr DAIF, x0 + call void @llvm.write_register.i64(metadata !1, i64 %x) + ret void +} + +define void @write_daifset() nounwind { +entry: +; CHECK-LABEL: write_daifset: +; CHECK: msr DAIFSET, #2 + call void @llvm.write_register.i64(metadata !2, i64 2) + ret void +} + +declare i64 @llvm.read_register.i64(metadata) nounwind +declare void @llvm.write_register.i64(metadata, i64) nounwind + +!0 = !{!"1:2:3:4:5"} +!1 = !{!"daif"} +!2 = !{!"daifset"} diff --git a/test/CodeGen/ARM/atomic-ops-v8.ll b/test/CodeGen/ARM/atomic-ops-v8.ll index db5007b0758d..86287c1178db 100644 --- a/test/CodeGen/ARM/atomic-ops-v8.ll +++ b/test/CodeGen/ARM/atomic-ops-v8.ll @@ -664,7 +664,7 @@ define void @test_atomic_load_min_i64(i64 %offset) nounwind { ; CHECK: movt r[[ADDR]], :upper16:var64 ; CHECK: .LBB{{[0-9]+}}_1: -; CHECK: ldaexd [[OLD1:r[0-9]+]], [[OLD2:r[0-9]+]], [r[[ADDR]]] +; CHECK: ldaexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]] ; r0, r1 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0 @@ -782,7 +782,7 @@ define void @test_atomic_load_max_i64(i64 %offset) nounwind { ; CHECK: movt r[[ADDR]], :upper16:var64 ; CHECK: .LBB{{[0-9]+}}_1: -; CHECK: ldrexd [[OLD1:r[0-9]+]], [[OLD2:r[0-9]+]], [r[[ADDR]]] +; CHECK: ldrexd [[OLD1:r[0-9]+]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]] ; r0, r1 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0 @@ -900,7 +900,7 @@ define void @test_atomic_load_umin_i64(i64 %offset) nounwind { ; CHECK: movt r[[ADDR]], :upper16:var64 ; CHECK: .LBB{{[0-9]+}}_1: -; CHECK: ldaexd [[OLD1:r[0-9]+]], [[OLD2:r[0-9]+]], [r[[ADDR]]] +; CHECK: ldaexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]] ; r0, r1 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0 @@ -1018,7 +1018,7 @@ define void @test_atomic_load_umax_i64(i64 %offset) nounwind { ; CHECK: movt r[[ADDR]], :upper16:var64 ; CHECK: .LBB{{[0-9]+}}_1: -; CHECK: ldaexd [[OLD1:r[0-9]+]], [[OLD2:r[0-9]+]], [r[[ADDR]]] +; CHECK: ldaexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]] ; r0, r1 below is a reasonable guess but could change: it certainly comes into the ; function there. 
; CHECK-ARM: mov [[LOCARRY:r[0-9]+|lr]], #0 @@ -1146,10 +1146,12 @@ define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind { ; function there. ; CHECK-LE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0 ; CHECK-LE-DAG: eor{{(\.w)?}} [[MISMATCH_HI:r[0-9]+|lr]], [[OLD2]], r1 -; CHECK-LE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_LO]], [[MISMATCH_HI]] +; CHECK-ARM-LE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_LO]], [[MISMATCH_HI]] +; CHECK-THUMB-LE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_HI]], [[MISMATCH_LO]] ; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_HI:r[0-9]+|lr]], [[OLD2]], r1 ; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0 -; CHECK-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]] +; CHECK-ARM-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]] +; CHECK-THUMB-BE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_LO]], [[MISMATCH_HI]] ; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 ; CHECK-NEXT: BB#2: ; As above, r2, r3 is a reasonable guess. diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll index 1982fa98ef41..e9de52a3e1a0 100644 --- a/test/CodeGen/ARM/build-attributes.ll +++ b/test/CodeGen/ARM/build-attributes.ll @@ -923,7 +923,7 @@ ; CORTEX-M4-SOFT: .eabi_attribute 7, 77 ; CORTEX-M4-SOFT: .eabi_attribute 8, 0 ; CORTEX-M4-SOFT: .eabi_attribute 9, 2 -; CORTEX-M4-SOFT: .fpu vfpv4-d16 +; CORTEX-M4-SOFT: .fpu fpv4-sp-d16 ; CORTEX-M4-SOFT-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-M4-SOFT: .eabi_attribute 20, 1 @@ -953,7 +953,7 @@ ; CORTEX-M4-HARD: .eabi_attribute 7, 77 ; CORTEX-M4-HARD: .eabi_attribute 8, 0 ; CORTEX-M4-HARD: .eabi_attribute 9, 2 -; CORTEX-M4-HARD: .fpu vfpv4-d16 +; CORTEX-M4-HARD: .fpu fpv4-sp-d16 ; CORTEX-M4-HARD-NOT: .eabi_attribute 19 ;; We default to IEEE 754 compliance ; CORTEX-M4-HARD: .eabi_attribute 20, 1 @@ -984,7 +984,7 @@ ; CORTEX-M7: .eabi_attribute 8, 0 ; CORTEX-M7: .eabi_attribute 9, 2 ; CORTEX-M7-SOFT-NOT: .fpu -; CORTEX-M7-SINGLE: .fpu fpv5-d16 +; CORTEX-M7-SINGLE: .fpu fpv5-sp-d16 ; CORTEX-M7-DOUBLE: .fpu fpv5-d16 ; CORTEX-M7: .eabi_attribute 17, 1 ; CORTEX-M7-NOT: .eabi_attribute 19 diff --git a/test/CodeGen/ARM/ifcvt-callback.ll b/test/CodeGen/ARM/ifcvt-callback.ll new file mode 100644 index 000000000000..62a66e745b39 --- /dev/null +++ b/test/CodeGen/ARM/ifcvt-callback.ll @@ -0,0 +1,22 @@ +; RUN: llc -march thumb %s -o - | FileCheck %s + +; This test checks that if-conversion pass is unconditionally added to the pass +; pipeline and is conditionally executed based on the per-function targert-cpu +; attribute. + +; CHECK: ite eq + +define i32 @test_ifcvt(i32 %a, i32 %b) #0 { + %tmp2 = icmp eq i32 %a, 0 + br i1 %tmp2, label %cond_false, label %cond_true + +cond_true: + %tmp5 = add i32 %b, 1 + ret i32 %tmp5 + +cond_false: + %tmp7 = add i32 %b, -1 + ret i32 %tmp7 +} + +attributes #0 = { "target-cpu"="cortex-a8" } diff --git a/test/CodeGen/ARM/jump-table-islands-split.ll b/test/CodeGen/ARM/jump-table-islands-split.ll new file mode 100644 index 000000000000..deba21b4dbb1 --- /dev/null +++ b/test/CodeGen/ARM/jump-table-islands-split.ll @@ -0,0 +1,52 @@ +; RUN: llc -mtriple=thumbv7s-apple-ios8.0 -o - %s | FileCheck %s + +declare void @foo(double) +declare i32 @llvm.arm.space(i32, i32) + +; The constpool entry used to call @foo should be directly between where we want +; the tbb and its table. Fortunately, the flow is simple enough that we can +; eliminate the entry calculation (ADD) and use the ADR as the base. 
+; +; I'm hoping this won't be fragile, but if it does break the most likely fix is +; adjusting the @llvm.arm.space call slightly. If this happens too many times +; the test should probably be removed. +define i32 @test_jumptable_not_adjacent(i1 %tst, i32 %sw, i32 %l) { +; CHECK-LABEL: test_jumptable_not_adjacent: +; CHECK: vldr {{d[0-9]+}}, [[DBL_CONST:LCPI[0-9]+_[0-9]+]] +; [...] +; CHECK: adr.w r[[BASE:[0-9]+]], [[JUMP_TABLE:LJTI[0-9]+_[0-9]+]] +; CHECK-NOT: r[[BASE]] + +; CHECK: [[TBB_KEY:LCPI[0-9]+_[0-9]+]]: +; CHECK-NEXT: tbb [r[[BASE]], {{r[0-9]+}}] + +; CHECK: [[DBL_CONST]]: +; CHECK: .long +; CHECK: .long +; CHECK: [[JUMP_TABLE]]: +; CHECK: .byte (LBB{{[0-9]+}}_{{[0-9]+}}-([[TBB_KEY]]+4) + + br label %complex + +complex: + call void @foo(double 12345.0) + call i32 @llvm.arm.space(i32 970, i32 undef) + switch i32 %sw, label %second [ i32 0, label %other + i32 1, label %third + i32 2, label %end + i32 3, label %other ] + +second: + ret i32 43 +third: + ret i32 0 + +other: + call void @bar() + unreachable + +end: + ret i32 42 +} + +declare void @bar() diff --git a/test/CodeGen/ARM/jump-table-islands.ll b/test/CodeGen/ARM/jump-table-islands.ll new file mode 100644 index 000000000000..6b4f174c0928 --- /dev/null +++ b/test/CodeGen/ARM/jump-table-islands.ll @@ -0,0 +1,40 @@ +; RUN: llc -mtriple=armv7-apple-ios8.0 -o - %s | FileCheck %s + +%BigInt = type i5500 + +define %BigInt @test_moved_jumptable(i1 %tst, i32 %sw, %BigInt %l) { +; CHECK-LABEL: test_moved_jumptable: + +; CHECK: adr {{r[0-9]+}}, [[JUMP_TABLE:LJTI[0-9]+_[0-9]+]] +; CHECK: b [[SKIP_TABLE:LBB[0-9]+_[0-9]+]] + +; CHECK: [[JUMP_TABLE]]: +; CHECK: .data_region jt32 +; CHECK: .long LBB{{[0-9]+_[0-9]+}}-[[JUMP_TABLE]] + +; CHECK: [[SKIP_TABLE]]: +; CHECK: add pc, {{r[0-9]+}}, {{r[0-9]+}} + br i1 %tst, label %simple, label %complex + +simple: + br label %end + +complex: + switch i32 %sw, label %simple [ i32 0, label %other + i32 1, label %third + i32 5, label %end + i32 6, label %other ] + +third: + ret %BigInt 0 + +other: + call void @bar() + unreachable + +end: + %val = phi %BigInt [ %l, %complex ], [ -1, %simple ] + ret %BigInt %val +} + +declare void @bar() diff --git a/test/CodeGen/ARM/jumptable-label.ll b/test/CodeGen/ARM/jumptable-label.ll index 49d698672f82..2ba90dc97365 100644 --- a/test/CodeGen/ARM/jumptable-label.ll +++ b/test/CodeGen/ARM/jumptable-label.ll @@ -2,8 +2,8 @@ ; test that we print the label of a bb that is only used in a jump table. 
-; CHECK: .long LBB0_2 -; CHECK: LBB0_2: +; CHECK: .long [[JUMPTABLE_DEST:LBB[0-9]+_[0-9]+]] +; CHECK: [[JUMPTABLE_DEST]]: define i32 @calculate() { entry: diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll index 7ce846844e05..a8070ea68aa2 100644 --- a/test/CodeGen/ARM/ldrd.ll +++ b/test/CodeGen/ARM/ldrd.ll @@ -92,6 +92,22 @@ entry: ret void } +declare void @extfunc(i32, i32, i32, i32) + +; CHECK-LABEL: Func2: +; A8: ldrd +; A8: blx +; A8: pop +define void @Func2(i32* %p) { +entry: + %addr0 = getelementptr i32, i32* %p, i32 0 + %addr1 = getelementptr i32, i32* %p, i32 1 + %v0 = load i32, i32* %addr0 + %v1 = load i32, i32* %addr1 + ; try to force %v0/%v1 into non-adjacent registers + call void @extfunc(i32 %v0, i32 0, i32 0, i32 %v1) + ret void +} declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind diff --git a/test/CodeGen/ARM/named-reg-alloc.ll b/test/CodeGen/ARM/named-reg-alloc.ll index 380cf39734ff..d41fa64882c8 100644 --- a/test/CodeGen/ARM/named-reg-alloc.ll +++ b/test/CodeGen/ARM/named-reg-alloc.ll @@ -4,7 +4,7 @@ define i32 @get_stack() nounwind { entry: ; FIXME: Include an allocatable-specific error message -; CHECK: Invalid register name global variable +; CHECK: Invalid register name "r5". %sp = call i32 @llvm.read_register.i32(metadata !0) ret i32 %sp } diff --git a/test/CodeGen/ARM/named-reg-notareg.ll b/test/CodeGen/ARM/named-reg-notareg.ll index 3ac03f4fdaaa..45cb38f30f35 100644 --- a/test/CodeGen/ARM/named-reg-notareg.ll +++ b/test/CodeGen/ARM/named-reg-notareg.ll @@ -3,7 +3,7 @@ define i32 @get_stack() nounwind { entry: -; CHECK: Invalid register name global variable +; CHECK: Invalid register name "notareg". %sp = call i32 @llvm.read_register.i32(metadata !0) ret i32 %sp } diff --git a/test/CodeGen/ARM/special-reg-acore.ll b/test/CodeGen/ARM/special-reg-acore.ll new file mode 100644 index 000000000000..3d65ff44bfb0 --- /dev/null +++ b/test/CodeGen/ARM/special-reg-acore.ll @@ -0,0 +1,78 @@ +; RUN: llc < %s -mtriple=arm-none-eabi -mcpu=cortex-a8 2>&1 | FileCheck %s --check-prefix=ACORE +; RUN: not llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 2>&1 | FileCheck %s --check-prefix=MCORE + +; MCORE: LLVM ERROR: Invalid register name "cpsr". 
+ +define i32 @read_cpsr() nounwind { + ; ACORE-LABEL: read_cpsr: + ; ACORE: mrs r0, apsr + %reg = call i32 @llvm.read_register.i32(metadata !1) + ret i32 %reg +} + +define i32 @read_aclass_registers() nounwind { +entry: + ; ACORE-LABEL: read_aclass_registers: + ; ACORE: mrs r0, apsr + ; ACORE: mrs r1, spsr + + %0 = call i32 @llvm.read_register.i32(metadata !0) + %1 = call i32 @llvm.read_register.i32(metadata !1) + %add1 = add i32 %1, %0 + %2 = call i32 @llvm.read_register.i32(metadata !2) + %add2 = add i32 %add1, %2 + ret i32 %add2 +} + +define void @write_aclass_registers(i32 %x) nounwind { +entry: + ; ACORE-LABEL: write_aclass_registers: + ; ACORE: msr APSR_nzcvq, r0 + ; ACORE: msr APSR_g, r0 + ; ACORE: msr APSR_nzcvqg, r0 + ; ACORE: msr CPSR_c, r0 + ; ACORE: msr CPSR_x, r0 + ; ACORE: msr APSR_g, r0 + ; ACORE: msr APSR_nzcvq, r0 + ; ACORE: msr CPSR_fsxc, r0 + ; ACORE: msr SPSR_c, r0 + ; ACORE: msr SPSR_x, r0 + ; ACORE: msr SPSR_s, r0 + ; ACORE: msr SPSR_f, r0 + ; ACORE: msr SPSR_fsxc, r0 + + call void @llvm.write_register.i32(metadata !3, i32 %x) + call void @llvm.write_register.i32(metadata !4, i32 %x) + call void @llvm.write_register.i32(metadata !5, i32 %x) + call void @llvm.write_register.i32(metadata !6, i32 %x) + call void @llvm.write_register.i32(metadata !7, i32 %x) + call void @llvm.write_register.i32(metadata !8, i32 %x) + call void @llvm.write_register.i32(metadata !9, i32 %x) + call void @llvm.write_register.i32(metadata !10, i32 %x) + call void @llvm.write_register.i32(metadata !11, i32 %x) + call void @llvm.write_register.i32(metadata !12, i32 %x) + call void @llvm.write_register.i32(metadata !13, i32 %x) + call void @llvm.write_register.i32(metadata !14, i32 %x) + call void @llvm.write_register.i32(metadata !15, i32 %x) + ret void +} + +declare i32 @llvm.read_register.i32(metadata) nounwind +declare void @llvm.write_register.i32(metadata, i32) nounwind + +!0 = !{!"apsr"} +!1 = !{!"cpsr"} +!2 = !{!"spsr"} +!3 = !{!"apsr_nzcvq"} +!4 = !{!"apsr_g"} +!5 = !{!"apsr_nzcvqg"} +!6 = !{!"cpsr_c"} +!7 = !{!"cpsr_x"} +!8 = !{!"cpsr_s"} +!9 = !{!"cpsr_f"} +!10 = !{!"cpsr_cxsf"} +!11 = !{!"spsr_c"} +!12 = !{!"spsr_x"} +!13 = !{!"spsr_s"} +!14 = !{!"spsr_f"} +!15 = !{!"spsr_cxsf"} diff --git a/test/CodeGen/ARM/special-reg-mcore.ll b/test/CodeGen/ARM/special-reg-mcore.ll new file mode 100644 index 000000000000..686da0f6b839 --- /dev/null +++ b/test/CodeGen/ARM/special-reg-mcore.ll @@ -0,0 +1,143 @@ +; RUN: llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 2>&1 | FileCheck %s --check-prefix=MCORE +; RUN: not llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m3 2>&1 | FileCheck %s --check-prefix=M3CORE +; RUN: not llc < %s -mtriple=arm-none-eabi -mcpu=cortex-a8 2>&1 | FileCheck %s --check-prefix=ACORE + +; ACORE: LLVM ERROR: Invalid register name "control". +; M3CORE: LLVM ERROR: Invalid register name "control". 
+ +define i32 @read_mclass_registers() nounwind { +entry: + ; MCORE-LABEL: read_mclass_registers: + ; MCORE: mrs r0, apsr + ; MCORE: mrs r1, iapsr + ; MCORE: mrs r1, eapsr + ; MCORE: mrs r1, xpsr + ; MCORE: mrs r1, ipsr + ; MCORE: mrs r1, epsr + ; MCORE: mrs r1, iepsr + ; MCORE: mrs r1, msp + ; MCORE: mrs r1, psp + ; MCORE: mrs r1, primask + ; MCORE: mrs r1, basepri + ; MCORE: mrs r1, basepri_max + ; MCORE: mrs r1, faultmask + ; MCORE: mrs r1, control + + %0 = call i32 @llvm.read_register.i32(metadata !0) + %1 = call i32 @llvm.read_register.i32(metadata !4) + %add1 = add i32 %1, %0 + %2 = call i32 @llvm.read_register.i32(metadata !8) + %add2 = add i32 %add1, %2 + %3 = call i32 @llvm.read_register.i32(metadata !12) + %add3 = add i32 %add2, %3 + %4 = call i32 @llvm.read_register.i32(metadata !16) + %add4 = add i32 %add3, %4 + %5 = call i32 @llvm.read_register.i32(metadata !17) + %add5 = add i32 %add4, %5 + %6 = call i32 @llvm.read_register.i32(metadata !18) + %add6 = add i32 %add5, %6 + %7 = call i32 @llvm.read_register.i32(metadata !19) + %add7 = add i32 %add6, %7 + %8 = call i32 @llvm.read_register.i32(metadata !20) + %add8 = add i32 %add7, %8 + %9 = call i32 @llvm.read_register.i32(metadata !21) + %add9 = add i32 %add8, %9 + %10 = call i32 @llvm.read_register.i32(metadata !22) + %add10 = add i32 %add9, %10 + %11 = call i32 @llvm.read_register.i32(metadata !23) + %add11 = add i32 %add10, %11 + %12 = call i32 @llvm.read_register.i32(metadata !24) + %add12 = add i32 %add11, %12 + %13 = call i32 @llvm.read_register.i32(metadata !25) + %add13 = add i32 %add12, %13 + ret i32 %add13 +} + +define void @write_mclass_registers(i32 %x) nounwind { +entry: + ; MCORE-LABEL: write_mclass_registers: + ; MCORE: msr apsr_nzcvqg, r0 + ; MCORE: msr apsr_nzcvq, r0 + ; MCORE: msr apsr_g, r0 + ; MCORE: msr apsr_nzcvqg, r0 + ; MCORE: msr iapsr_nzcvqg, r0 + ; MCORE: msr iapsr_nzcvq, r0 + ; MCORE: msr iapsr_g, r0 + ; MCORE: msr iapsr_nzcvqg, r0 + ; MCORE: msr eapsr_nzcvqg, r0 + ; MCORE: msr eapsr_nzcvq, r0 + ; MCORE: msr eapsr_g, r0 + ; MCORE: msr eapsr_nzcvqg, r0 + ; MCORE: msr xpsr_nzcvqg, r0 + ; MCORE: msr xpsr_nzcvq, r0 + ; MCORE: msr xpsr_g, r0 + ; MCORE: msr xpsr_nzcvqg, r0 + ; MCORE: msr ipsr, r0 + ; MCORE: msr epsr, r0 + ; MCORE: msr iepsr, r0 + ; MCORE: msr msp, r0 + ; MCORE: msr psp, r0 + ; MCORE: msr primask, r0 + ; MCORE: msr basepri, r0 + ; MCORE: msr basepri_max, r0 + ; MCORE: msr faultmask, r0 + ; MCORE: msr control, r0 + + call void @llvm.write_register.i32(metadata !0, i32 %x) + call void @llvm.write_register.i32(metadata !1, i32 %x) + call void @llvm.write_register.i32(metadata !2, i32 %x) + call void @llvm.write_register.i32(metadata !3, i32 %x) + call void @llvm.write_register.i32(metadata !4, i32 %x) + call void @llvm.write_register.i32(metadata !5, i32 %x) + call void @llvm.write_register.i32(metadata !6, i32 %x) + call void @llvm.write_register.i32(metadata !7, i32 %x) + call void @llvm.write_register.i32(metadata !8, i32 %x) + call void @llvm.write_register.i32(metadata !9, i32 %x) + call void @llvm.write_register.i32(metadata !10, i32 %x) + call void @llvm.write_register.i32(metadata !11, i32 %x) + call void @llvm.write_register.i32(metadata !12, i32 %x) + call void @llvm.write_register.i32(metadata !13, i32 %x) + call void @llvm.write_register.i32(metadata !14, i32 %x) + call void @llvm.write_register.i32(metadata !15, i32 %x) + call void @llvm.write_register.i32(metadata !16, i32 %x) + call void @llvm.write_register.i32(metadata !17, i32 %x) + call void @llvm.write_register.i32(metadata 
!18, i32 %x) + call void @llvm.write_register.i32(metadata !19, i32 %x) + call void @llvm.write_register.i32(metadata !20, i32 %x) + call void @llvm.write_register.i32(metadata !21, i32 %x) + call void @llvm.write_register.i32(metadata !22, i32 %x) + call void @llvm.write_register.i32(metadata !23, i32 %x) + call void @llvm.write_register.i32(metadata !24, i32 %x) + call void @llvm.write_register.i32(metadata !25, i32 %x) + ret void +} + +declare i32 @llvm.read_register.i32(metadata) nounwind +declare void @llvm.write_register.i32(metadata, i32) nounwind + +!0 = !{!"apsr"} +!1 = !{!"apsr_nzcvq"} +!2 = !{!"apsr_g"} +!3 = !{!"apsr_nzcvqg"} +!4 = !{!"iapsr"} +!5 = !{!"iapsr_nzcvq"} +!6 = !{!"iapsr_g"} +!7 = !{!"iapsr_nzcvqg"} +!8 = !{!"eapsr"} +!9 = !{!"eapsr_nzcvq"} +!10 = !{!"eapsr_g"} +!11 = !{!"eapsr_nzcvqg"} +!12 = !{!"xpsr"} +!13 = !{!"xpsr_nzcvq"} +!14 = !{!"xpsr_g"} +!15 = !{!"xpsr_nzcvqg"} +!16 = !{!"ipsr"} +!17 = !{!"epsr"} +!18 = !{!"iepsr"} +!19 = !{!"msp"} +!20 = !{!"psp"} +!21 = !{!"primask"} +!22 = !{!"basepri"} +!23 = !{!"basepri_max"} +!24 = !{!"faultmask"} +!25 = !{!"control"} diff --git a/test/CodeGen/ARM/special-reg.ll b/test/CodeGen/ARM/special-reg.ll new file mode 100644 index 000000000000..7ccb490f5d4a --- /dev/null +++ b/test/CodeGen/ARM/special-reg.ll @@ -0,0 +1,78 @@ +; RUN: llc < %s -mtriple=arm-none-eabi -mcpu=cortex-a8 2>&1 | FileCheck %s --check-prefix=ARM --check-prefix=ACORE +; RUN: llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 2>&1 | FileCheck %s --check-prefix=ARM --check-prefix=MCORE + +define i32 @read_i32_encoded_register() nounwind { +entry: +; ARM-LABEL: read_i32_encoded_register: +; ARM: mrc p1, #2, r0, c3, c4, #5 + %reg = call i32 @llvm.read_register.i32(metadata !0) + ret i32 %reg +} + +define i64 @read_i64_encoded_register() nounwind { +entry: +; ARM-LABEL: read_i64_encoded_register: +; ARM: mrrc p1, #2, r0, r1, c3 + %reg = call i64 @llvm.read_register.i64(metadata !1) + ret i64 %reg +} + +define i32 @read_apsr() nounwind { +entry: +; ARM-LABEL: read_apsr: +; ARM: mrs r0, apsr + %reg = call i32 @llvm.read_register.i32(metadata !2) + ret i32 %reg +} + +define i32 @read_fpscr() nounwind { +entry: +; ARM-LABEL: read_fpscr: +; ARM: vmrs r0, fpscr + %reg = call i32 @llvm.read_register.i32(metadata !3) + ret i32 %reg +} + +define void @write_i32_encoded_register(i32 %x) nounwind { +entry: +; ARM-LABEL: write_i32_encoded_register: +; ARM: mcr p1, #2, r0, c3, c4, #5 + call void @llvm.write_register.i32(metadata !0, i32 %x) + ret void +} + +define void @write_i64_encoded_register(i64 %x) nounwind { +entry: +; ARM-LABEL: write_i64_encoded_register: +; ARM: mcrr p1, #2, r0, r1, c3 + call void @llvm.write_register.i64(metadata !1, i64 %x) + ret void +} + +define void @write_apsr(i32 %x) nounwind { +entry: +; ARM-LABEL: write_apsr: +; ACORE: msr APSR_nzcvq, r0 +; MCORE: msr apsr_nzcvq, r0 + call void @llvm.write_register.i32(metadata !4, i32 %x) + ret void +} + +define void @write_fpscr(i32 %x) nounwind { +entry: +; ARM-LABEL: write_fpscr: +; ARM: vmsr fpscr, r0 + call void @llvm.write_register.i32(metadata !3, i32 %x) + ret void +} + +declare i32 @llvm.read_register.i32(metadata) nounwind +declare i64 @llvm.read_register.i64(metadata) nounwind +declare void @llvm.write_register.i32(metadata, i32) nounwind +declare void @llvm.write_register.i64(metadata, i64) nounwind + +!0 = !{!"cp1:2:c3:c4:5"} +!1 = !{!"cp1:2:c3"} +!2 = !{!"apsr"} +!3 = !{!"fpscr"} +!4 = !{!"apsr_nzcvq"} diff --git a/test/CodeGen/BPF/alu8.ll b/test/CodeGen/BPF/alu8.ll index 
0233225f81b5..c1c2bd29f247 100644 --- a/test/CodeGen/BPF/alu8.ll +++ b/test/CodeGen/BPF/alu8.ll @@ -1,5 +1,4 @@ -; RUN: llc -march=bpf -show-mc-encoding < %s | FileCheck %s -; test little endian only for now +; RUN: llc -march=bpfel -show-mc-encoding < %s | FileCheck %s define i8 @mov(i8 %a, i8 %b) nounwind { ; CHECK-LABEL: mov: diff --git a/test/CodeGen/BPF/atomics.ll b/test/CodeGen/BPF/atomics.ll index 2f9730ddddef..a2e17d291968 100644 --- a/test/CodeGen/BPF/atomics.ll +++ b/test/CodeGen/BPF/atomics.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -march=bpf -verify-machineinstrs -show-mc-encoding | FileCheck %s -; test little endian only for now +; RUN: llc < %s -march=bpfel -verify-machineinstrs -show-mc-encoding | FileCheck %s ; CHECK-LABEL: test_load_add_32 ; CHECK: xadd32 diff --git a/test/CodeGen/BPF/basictest.ll b/test/CodeGen/BPF/basictest.ll index 2a2d49878a63..82feb43d005c 100644 --- a/test/CodeGen/BPF/basictest.ll +++ b/test/CodeGen/BPF/basictest.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=bpf | FileCheck %s +; RUN: llc < %s -march=bpfel | FileCheck %s define i32 @test0(i32 %X) { %tmp.1 = add i32 %X, 1 diff --git a/test/CodeGen/BPF/cc_args.ll b/test/CodeGen/BPF/cc_args.ll index 5085fe5684eb..8e3f8604ac88 100644 --- a/test/CodeGen/BPF/cc_args.ll +++ b/test/CodeGen/BPF/cc_args.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -march=bpf -show-mc-encoding | FileCheck %s -; test little endian only for now +; RUN: llc < %s -march=bpfel -show-mc-encoding | FileCheck %s define void @test() #0 { entry: diff --git a/test/CodeGen/BPF/cc_args_be.ll b/test/CodeGen/BPF/cc_args_be.ll new file mode 100644 index 000000000000..59a7439728ba --- /dev/null +++ b/test/CodeGen/BPF/cc_args_be.ll @@ -0,0 +1,96 @@ +; RUN: llc < %s -march=bpfeb -show-mc-encoding | FileCheck %s +; test big endian + +define void @test() #0 { +entry: +; CHECK: test: + +; CHECK: mov r1, 123 # encoding: [0xb7,0x10,0x00,0x00,0x00,0x00,0x00,0x7b] +; CHECK: call f_i16 + call void @f_i16(i16 123) + +; CHECK: mov r1, 12345678 # encoding: [0xb7,0x10,0x00,0x00,0x00,0xbc,0x61,0x4e] +; CHECK: call f_i32 + call void @f_i32(i32 12345678) + +; CHECK: ld_64 r1, 72623859790382856 # encoding: [0x18,0x10,0x00,0x00,0x05,0x06,0x07,0x08,0x00,0x00,0x00,0x00,0x01,0x02,0x03,0x04] +; CHECK: call f_i64 + call void @f_i64(i64 72623859790382856) + +; CHECK: mov r1, 1234 +; CHECK: mov r2, 5678 +; CHECK: call f_i32_i32 + call void @f_i32_i32(i32 1234, i32 5678) + +; CHECK: mov r1, 2 +; CHECK: mov r2, 3 +; CHECK: mov r3, 4 +; CHECK: call f_i16_i32_i16 + call void @f_i16_i32_i16(i16 2, i32 3, i16 4) + +; CHECK: mov r1, 5 +; CHECK: ld_64 r2, 7262385979038285 +; CHECK: mov r3, 6 +; CHECK: call f_i16_i64_i16 + call void @f_i16_i64_i16(i16 5, i64 7262385979038285, i16 6) + + ret void +} + +@g_i16 = common global i16 0, align 2 +@g_i32 = common global i32 0, align 2 +@g_i64 = common global i64 0, align 4 + +define void @f_i16(i16 %a) #0 { +; CHECK: f_i16: +; CHECK: sth 0(r2), r1 # encoding: [0x6b,0x21,0x00,0x00,0x00,0x00,0x00,0x00] + store volatile i16 %a, i16* @g_i16, align 2 + ret void +} + +define void @f_i32(i32 %a) #0 { +; CHECK: f_i32: +; CHECK: sth 2(r2), r1 # encoding: [0x6b,0x21,0x00,0x02,0x00,0x00,0x00,0x00] +; CHECK: sth 0(r2), r1 # encoding: [0x6b,0x21,0x00,0x00,0x00,0x00,0x00,0x00] + store volatile i32 %a, i32* @g_i32, align 2 + ret void +} + +define void @f_i64(i64 %a) #0 { +; CHECK: f_i64: +; CHECK: stw 4(r2), r1 # encoding: [0x63,0x21,0x00,0x04,0x00,0x00,0x00,0x00] +; CHECK: stw 0(r2), r1 + store volatile i64 %a, i64* @g_i64, align 2 + ret void +} + +define void 
@f_i32_i32(i32 %a, i32 %b) #0 { +; CHECK: f_i32_i32: +; CHECK: stw 0(r3), r1 + store volatile i32 %a, i32* @g_i32, align 4 +; CHECK: stw 0(r3), r2 + store volatile i32 %b, i32* @g_i32, align 4 + ret void +} + +define void @f_i16_i32_i16(i16 %a, i32 %b, i16 %c) #0 { +; CHECK: f_i16_i32_i16: +; CHECK: sth 0(r4), r1 + store volatile i16 %a, i16* @g_i16, align 2 +; CHECK: stw 0(r1), r2 + store volatile i32 %b, i32* @g_i32, align 4 +; CHECK: sth 0(r4), r3 + store volatile i16 %c, i16* @g_i16, align 2 + ret void +} + +define void @f_i16_i64_i16(i16 %a, i64 %b, i16 %c) #0 { +; CHECK: f_i16_i64_i16: +; CHECK: sth 0(r4), r1 + store volatile i16 %a, i16* @g_i16, align 2 +; CHECK: std 0(r1), r2 # encoding: [0x7b,0x12,0x00,0x00,0x00,0x00,0x00,0x00] + store volatile i64 %b, i64* @g_i64, align 8 +; CHECK: sth 0(r4), r3 + store volatile i16 %c, i16* @g_i16, align 2 + ret void +} diff --git a/test/CodeGen/BPF/cc_ret.ll b/test/CodeGen/BPF/cc_ret.ll index e32b17bcc61c..09574922f325 100644 --- a/test/CodeGen/BPF/cc_ret.ll +++ b/test/CodeGen/BPF/cc_ret.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=bpf | FileCheck %s +; RUN: llc < %s -march=bpfel | FileCheck %s define void @test() #0 { entry: diff --git a/test/CodeGen/BPF/ex1.ll b/test/CodeGen/BPF/ex1.ll index be038e9a3d8c..546e5d49da69 100644 --- a/test/CodeGen/BPF/ex1.ll +++ b/test/CodeGen/BPF/ex1.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=bpf | FileCheck %s +; RUN: llc < %s -march=bpfel | FileCheck %s %struct.bpf_context = type { i64, i64, i64, i64, i64, i64, i64 } %struct.sk_buff = type { i64, i64, i64, i64, i64, i64, i64 } diff --git a/test/CodeGen/BPF/intrinsics.ll b/test/CodeGen/BPF/intrinsics.ll index 98b57deb7c8d..483473e922fc 100644 --- a/test/CodeGen/BPF/intrinsics.ll +++ b/test/CodeGen/BPF/intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=bpf -show-mc-encoding | FileCheck %s +; RUN: llc < %s -march=bpfel -show-mc-encoding | FileCheck %s ; Function Attrs: nounwind uwtable define i32 @ld_b(i64 %foo, i64* nocapture %bar, i8* %ctx, i8* %ctx2) #0 { diff --git a/test/CodeGen/BPF/load.ll b/test/CodeGen/BPF/load.ll index 03fb17c965b5..d4ba315b5f18 100644 --- a/test/CodeGen/BPF/load.ll +++ b/test/CodeGen/BPF/load.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=bpf | FileCheck %s +; RUN: llc < %s -march=bpfel | FileCheck %s define i16 @am1(i16* %a) nounwind { %1 = load i16, i16* %a diff --git a/test/CodeGen/BPF/loops.ll b/test/CodeGen/BPF/loops.ll index 4798d78842ca..00be54b3bac5 100644 --- a/test/CodeGen/BPF/loops.ll +++ b/test/CodeGen/BPF/loops.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=bpf | FileCheck %s +; RUN: llc < %s -march=bpfel | FileCheck %s define zeroext i16 @add(i16* nocapture %a, i16 zeroext %n) nounwind readonly { entry: diff --git a/test/CodeGen/BPF/sanity.ll b/test/CodeGen/BPF/sanity.ll index 09a6b65d0854..7f0ef889ff99 100644 --- a/test/CodeGen/BPF/sanity.ll +++ b/test/CodeGen/BPF/sanity.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=bpf | FileCheck %s +; RUN: llc < %s -march=bpfel | FileCheck %s @foo_printf.fmt = private unnamed_addr constant [9 x i8] c"hello \0A\00", align 1 diff --git a/test/CodeGen/BPF/setcc.ll b/test/CodeGen/BPF/setcc.ll index eabb6c9bf2d6..f6c6db6c6836 100644 --- a/test/CodeGen/BPF/setcc.ll +++ b/test/CodeGen/BPF/setcc.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=bpf < %s | FileCheck %s +; RUN: llc -march=bpfel < %s | FileCheck %s define i16 @sccweqand(i16 %a, i16 %b) nounwind { %t1 = and i16 %a, %b diff --git a/test/CodeGen/BPF/shifts.ll b/test/CodeGen/BPF/shifts.ll index 898ae2d46123..cb000b92fcd9 100644 --- 
a/test/CodeGen/BPF/shifts.ll +++ b/test/CodeGen/BPF/shifts.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -march=bpf -show-mc-encoding | FileCheck %s -; test little endian only for now +; RUN: llc < %s -march=bpfel -show-mc-encoding | FileCheck %s define zeroext i8 @lshr8(i8 zeroext %a, i8 zeroext %cnt) nounwind readnone { entry: diff --git a/test/CodeGen/BPF/sockex2.ll b/test/CodeGen/BPF/sockex2.ll index 6ae5e1c8d6bf..d372a5982f68 100644 --- a/test/CodeGen/BPF/sockex2.ll +++ b/test/CodeGen/BPF/sockex2.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -march=bpf -show-mc-encoding | FileCheck %s -; test little endian only for now +; RUN: llc < %s -march=bpfel -show-mc-encoding | FileCheck %s %struct.bpf_map_def = type { i32, i32, i32, i32 } %struct.sk_buff = type opaque diff --git a/test/CodeGen/Generic/stop-after.ll b/test/CodeGen/Generic/stop-after.ll index 557e097840af..791378c3737d 100644 --- a/test/CodeGen/Generic/stop-after.ll +++ b/test/CodeGen/Generic/stop-after.ll @@ -1,9 +1,10 @@ ; RUN: llc < %s -debug-pass=Structure -stop-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START -; STOP: -loop-reduce -print-module +; STOP: -loop-reduce ; STOP: Loop Strength Reduction ; STOP-NEXT: Machine Function Analysis +; STOP-NEXT: MIR Printing Pass ; START: -machine-branch-prob -gc-lowering ; START: FunctionPass Manager diff --git a/test/CodeGen/Hexagon/args.ll b/test/CodeGen/Hexagon/args.ll index aea4ffe2eee5..1c470f68aa27 100644 --- a/test/CodeGen/Hexagon/args.ll +++ b/test/CodeGen/Hexagon/args.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched -disable-hexagon-misched < %s | FileCheck %s -; CHECK: memw(r29{{ *}}+{{ *}}#0){{ *}}={{ *}}#7 -; CHECK: r1:0 = combine(#2, #1) -; CHECK: r3:2 = combine(#4, #3) +; RUN: llc -march=hexagon < %s | FileCheck %s ; CHECK: r5:4 = combine(#6, #5) +; CHECK: r3:2 = combine(#4, #3) +; CHECK: r1:0 = combine(#2, #1) +; CHECK: memw(r29{{ *}}+{{ *}}#0){{ *}}={{ *}}#7 define void @foo() nounwind { diff --git a/test/CodeGen/Hexagon/calling-conv.ll b/test/CodeGen/Hexagon/calling-conv.ll deleted file mode 100644 index 7133c1ae7aad..000000000000 --- a/test/CodeGen/Hexagon/calling-conv.ll +++ /dev/null @@ -1,73 +0,0 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 <%s | \ -; RUN: FileCheck %s --check-prefix=CHECK-ONE -; RUN: llc -march=hexagon -mcpu=hexagonv5 <%s | \ -; RUN: FileCheck %s --check-prefix=CHECK-TWO -; RUN: llc -march=hexagon -mcpu=hexagonv5 <%s | \ -; RUN: FileCheck %s --check-prefix=CHECK-THREE - -%struct.test_struct = type { i32, i8, i64 } -%struct.test_struct_long = type { i8, i64 } - -@mystruct = external global %struct.test_struct*, align 4 - -; CHECK-ONE: memw(r29+#48) = r2 -; CHECK-TWO: memw(r29+#52) = r2 -; CHECK-THREE: memw(r29+#56) = r2 -; Function Attrs: nounwind -define void @foo(%struct.test_struct* noalias sret %agg.result, i32 %a, i8 zeroext %c, %struct.test_struct* byval %s, %struct.test_struct_long* byval %t) #0 { -entry: - %a.addr = alloca i32, align 4 - %c.addr = alloca i8, align 1 - %z = alloca i32, align 4 - %ret = alloca %struct.test_struct, align 8 - store i32 %a, i32* %a.addr, align 4 - store i8 %c, i8* %c.addr, align 1 - %0 = bitcast i32* %z to i8* - call void @llvm.lifetime.start(i64 4, i8* %0) #1 - store i32 45, i32* %z, align 4 - %1 = bitcast %struct.test_struct* %ret to i8* - call void @llvm.lifetime.start(i64 16, i8* %1) #1 - %2 = load i32, i32* %z, align 4 - %3 = load %struct.test_struct*, 
%struct.test_struct** @mystruct, align 4 - %4 = load %struct.test_struct*, %struct.test_struct** @mystruct, align 4 - %5 = load i8, i8* %c.addr, align 1 - %6 = load i32, i32* %a.addr, align 4 - %conv = sext i32 %6 to i64 - %add = add nsw i64 %conv, 1 - %7 = load i32, i32* %a.addr, align 4 - %add1 = add nsw i32 %7, 2 - %8 = load i32, i32* %a.addr, align 4 - %conv2 = sext i32 %8 to i64 - %add3 = add nsw i64 %conv2, 3 - %9 = load i8, i8* %c.addr, align 1 - %10 = load i8, i8* %c.addr, align 1 - %11 = load i8, i8* %c.addr, align 1 - %12 = load i32, i32* %z, align 4 - call void @bar(%struct.test_struct* sret %ret, i32 %2, %struct.test_struct* byval %3, %struct.test_struct* byval %4, i8 zeroext %5, i64 %add, i32 %add1, i64 %add3, i8 zeroext %9, i8 zeroext %10, i8 zeroext %11, i32 %12) - %x = getelementptr inbounds %struct.test_struct, %struct.test_struct* %ret, i32 0, i32 0 - store i32 20, i32* %x, align 4 - %13 = bitcast %struct.test_struct* %agg.result to i8* - %14 = bitcast %struct.test_struct* %ret to i8* - call void @llvm.memcpy.p0i8.p0i8.i32(i8* %13, i8* %14, i32 16, i32 8, i1 false) - %15 = bitcast %struct.test_struct* %ret to i8* - call void @llvm.lifetime.end(i64 16, i8* %15) #1 - %16 = bitcast i32* %z to i8* - call void @llvm.lifetime.end(i64 4, i8* %16) #1 - ret void -} - -; Function Attrs: nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 - -declare void @bar(%struct.test_struct* sret, i32, %struct.test_struct* byval, %struct.test_struct* byval, i8 zeroext, i64, i32, i64, i8 zeroext, i8 zeroext, i8 zeroext, i32) #2 - -; Function Attrs: nounwind -declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1 - -; Function Attrs: nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 - -attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv4" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind } -attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv4" "unsafe-fp-math"="false" "use-soft-float"="false" } - diff --git a/test/CodeGen/Hexagon/cext-valid-packet1.ll b/test/CodeGen/Hexagon/cext-valid-packet1.ll index a479d37e4ae5..35e7b364b508 100644 --- a/test/CodeGen/Hexagon/cext-valid-packet1.ll +++ b/test/CodeGen/Hexagon/cext-valid-packet1.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; XFAIL: ; Check that the packetizer generates valid packets with constant ; extended instructions. diff --git a/test/CodeGen/Hexagon/cext-valid-packet2.ll b/test/CodeGen/Hexagon/cext-valid-packet2.ll index 2eba74329960..c3a4915ec2e0 100644 --- a/test/CodeGen/Hexagon/cext-valid-packet2.ll +++ b/test/CodeGen/Hexagon/cext-valid-packet2.ll @@ -1,4 +1,5 @@ ; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s +; XFAIL: ; Check that the packetizer generates valid packets with constant ; extended add and base+offset store instructions. 
diff --git a/test/CodeGen/Hexagon/compound.ll b/test/CodeGen/Hexagon/compound.ll new file mode 100644 index 000000000000..f8d36b8b77d9 --- /dev/null +++ b/test/CodeGen/Hexagon/compound.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=hexagon -filetype=obj -o - %s | llvm-objdump -d - | FileCheck %s + +; CHECK: p0 = cmp.gt(r0,#-1); if (!p0.new) jump:nt + +declare void @a() +declare void @b() + +define void @foo(i32 %a) { +%b = icmp sgt i32 %a, -1 +br i1 %b, label %x, label %y +x: +call void @a() +ret void +y: +call void @b() +ret void +}
\ No newline at end of file diff --git a/test/CodeGen/Hexagon/dualstore.ll b/test/CodeGen/Hexagon/dualstore.ll index 33d9ce9b9351..9f4569d6459c 100644 --- a/test/CodeGen/Hexagon/dualstore.ll +++ b/test/CodeGen/Hexagon/dualstore.ll @@ -1,12 +1,11 @@ -; RUN: llc -march=hexagon -disable-hexagon-misched < %s | FileCheck %s +; RUN: llc -march=hexagon -filetype=obj %s -o - | llvm-objdump -d - | FileCheck %s ; Check that we generate dual stores in one packet in V4 -; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#{{[0-9]+}}){{ *}}= -; CHECK-NEXT: memw(r{{[0-9]+}}{{ *}}+{{ *}}#{{[0-9]+}}){{ *}}= +; CHECK: 00 40 9f 52 529f4000 +; CHECK: 10 10 00 f0 f0001010 -define i32 @main(i32 %v, i32* %p1, i32* %p2) nounwind { -entry: - store i32 %v, i32* %p1, align 4 - store i32 %v, i32* %p2, align 4 - ret i32 0 +define void @foo(i32* %a, i32* %b) { + store i32 0, i32* %a + store i32 0, i32* %b + ret void } diff --git a/test/CodeGen/Hexagon/duplex.ll b/test/CodeGen/Hexagon/duplex.ll new file mode 100644 index 000000000000..80fe61ceccca --- /dev/null +++ b/test/CodeGen/Hexagon/duplex.ll @@ -0,0 +1,7 @@ +; RUN: llc -march=hexagon -filetype=obj -o - %s | llvm-objdump -d - | FileCheck %s + +; CHECK: c0 3f 00 48 48003fc0 + +define i32 @foo() { +ret i32 0 +}
\ No newline at end of file diff --git a/test/CodeGen/Hexagon/relax.ll b/test/CodeGen/Hexagon/relax.ll new file mode 100644 index 000000000000..9823d4d1cd9c --- /dev/null +++ b/test/CodeGen/Hexagon/relax.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=hexagon -filetype=obj < %s | llvm-objdump -d -r - | FileCheck %s + +declare void @bar() + +define void @foo() { +call void @bar() +ret void +} + + +; CHECK: { allocframe(#0) } +; CHECK: { call 0 } +; CHECK: 00000004: R_HEX_B22_PCREL +; CHECK: { dealloc_return }
\ No newline at end of file diff --git a/test/CodeGen/Hexagon/sube.ll b/test/CodeGen/Hexagon/sube.ll index 873f52b2d5df..9735894c419e 100644 --- a/test/CodeGen/Hexagon/sube.ll +++ b/test/CodeGen/Hexagon/sube.ll @@ -3,10 +3,10 @@ ; CHECK: r{{[0-9]+:[0-9]+}} = #1 ; CHECK: r{{[0-9]+:[0-9]+}} = #0 ; CHECK: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}}) -; CHECK: r{{[0-9]+}} = mux(p{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}) -; CHECK: r{{[0-9]+}} = mux(p{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}) ; CHECK: r{{[0-9]+:[0-9]+}} = sub(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}}) ; CHECK: r{{[0-9]+:[0-9]+}} = sub(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}}) +; CHECK: r{{[0-9]+}} = mux(p{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}) +; CHECK: r{{[0-9]+}} = mux(p{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}) ; CHECK: r{{[0-9]+:[0-9]+}} = combine(r{{[0-9]+}}, r{{[0-9]+}}) define void @check_sube_subc(i64 %AL, i64 %AH, i64 %BL, i64 %BH, i64* %RL, i64* %RH) { diff --git a/test/CodeGen/MIR/lit.local.cfg b/test/CodeGen/MIR/lit.local.cfg new file mode 100644 index 000000000000..e69aa5765356 --- /dev/null +++ b/test/CodeGen/MIR/lit.local.cfg @@ -0,0 +1,2 @@ +config.suffixes = ['.mir'] + diff --git a/test/CodeGen/MIR/llvm-ir-error-reported.mir b/test/CodeGen/MIR/llvm-ir-error-reported.mir new file mode 100644 index 000000000000..013b28cd7890 --- /dev/null +++ b/test/CodeGen/MIR/llvm-ir-error-reported.mir @@ -0,0 +1,22 @@ +# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s +# This test ensures an error is reported if the embedded LLVM IR contains an +# error. + +--- | + + ; CHECK: [[@LINE+3]]:15: error: use of undefined value '%a' + define i32 @foo(i32 %x, i32 %y) { + %z = alloca i32, align 4 + store i32 %a, i32* %z, align 4 + br label %Test + Test: + %m = load i32, i32* %z, align 4 + %cond = icmp eq i32 %y, %m + br i1 %cond, label %IfEqual, label %IfUnequal + IfEqual: + ret i32 1 + IfUnequal: + ret i32 0 + } + +... diff --git a/test/CodeGen/MIR/llvmIR.mir b/test/CodeGen/MIR/llvmIR.mir new file mode 100644 index 000000000000..7a7b46b62638 --- /dev/null +++ b/test/CodeGen/MIR/llvmIR.mir @@ -0,0 +1,32 @@ +# RUN: llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s +# This test ensures that the LLVM IR that's embedded with MIR is parsed +# correctly. + +--- | + ; CHECK: define i32 @foo(i32 %x, i32 %y) + ; CHECK: %z = alloca i32, align 4 + ; CHECK: store i32 %x, i32* %z, align 4 + ; CHECK: br label %Test + ; CHECK: Test: + ; CHECK: %m = load i32, i32* %z, align 4 + ; CHECK: %cond = icmp eq i32 %y, %m + ; CHECK: br i1 %cond, label %IfEqual, label %IfUnequal + ; CHECK: IfEqual: + ; CHECK: ret i32 1 + ; CHECK: IfUnequal: + ; CHECK: ret i32 0 + define i32 @foo(i32 %x, i32 %y) { + %z = alloca i32, align 4 + store i32 %x, i32* %z, align 4 + br label %Test + Test: + %m = load i32, i32* %z, align 4 + %cond = icmp eq i32 %y, %m + br i1 %cond, label %IfEqual, label %IfUnequal + IfEqual: + ret i32 1 + IfUnequal: + ret i32 0 + } + +... diff --git a/test/CodeGen/MIR/llvmIRMissing.mir b/test/CodeGen/MIR/llvmIRMissing.mir new file mode 100644 index 000000000000..2acbcd1f9884 --- /dev/null +++ b/test/CodeGen/MIR/llvmIRMissing.mir @@ -0,0 +1,5 @@ +# RUN: llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s +# This test ensures that the MIR parser accepts files without the LLVM IR. + +--- +... 
diff --git a/test/CodeGen/MIR/machine-function-missing-name.mir b/test/CodeGen/MIR/machine-function-missing-name.mir new file mode 100644 index 000000000000..54668f1a5efe --- /dev/null +++ b/test/CodeGen/MIR/machine-function-missing-name.mir @@ -0,0 +1,22 @@ +# RUN: not llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s 2>&1 | FileCheck %s +# This test ensures that an error is reported when a machine function doesn't +# have a name attribute. + +--- | + + define i32 @foo() { + ret i32 0 + } + + define i32 @bar() { + ret i32 0 + } + +... +--- +# CHECK: [[@LINE+1]]:1: error: missing required key 'name' +nme: foo +... +--- +name: bar +... diff --git a/test/CodeGen/MIR/machine-function.mir b/test/CodeGen/MIR/machine-function.mir new file mode 100644 index 000000000000..679bfd2d1620 --- /dev/null +++ b/test/CodeGen/MIR/machine-function.mir @@ -0,0 +1,24 @@ +# RUN: llc -start-after branch-folder -stop-after branch-folder -o /dev/null %s | FileCheck %s +# This test ensures that the MIR parser parses machine functions correctly. + +--- | + + define i32 @foo() { + ret i32 0 + } + + define i32 @bar() { + ret i32 0 + } + +... +--- +# CHECK: name: foo +# CHECK-NEXT: ... +name: foo +... +--- +# CHECK: name: bar +# CHECK-NEXT: ... +name: bar +... diff --git a/test/CodeGen/Mips/Fast-ISel/bswap1.ll b/test/CodeGen/Mips/Fast-ISel/bswap1.ll new file mode 100644 index 000000000000..8ac9753fa463 --- /dev/null +++ b/test/CodeGen/Mips/Fast-ISel/bswap1.ll @@ -0,0 +1,58 @@ +; RUN: llc < %s -march=mipsel -mcpu=mips32 -O0 -relocation-model=pic \ +; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s \ +; RUN: -check-prefix=ALL -check-prefix=32R1 +; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 -relocation-model=pic \ +; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s \ +; RUN: -check-prefix=ALL -check-prefix=32R2 + +@a = global i16 -21829, align 2 +@b = global i32 -1430532899, align 4 +@a1 = common global i16 0, align 2 +@b1 = common global i32 0, align 4 + +declare i16 @llvm.bswap.i16(i16) +declare i32 @llvm.bswap.i32(i32) + +define void @b16() { + ; ALL-LABEL: b16: + + ; ALL: lw $[[A_ADDR:[0-9]+]], %got(a)($[[GOT_ADDR:[0-9]+]]) + ; ALL: lhu $[[A_VAL:[0-9]+]], 0($[[A_ADDR]]) + + ; 32R1: sll $[[TMP1:[0-9]+]], $[[A_VAL]], 8 + ; 32R1: srl $[[TMP2:[0-9]+]], $[[A_VAL]], 8 + ; 32R1: or $[[TMP3:[0-9]+]], $[[TMP1]], $[[TMP2]] + ; 32R1: andi $[[TMP4:[0-9]+]], $[[TMP3]], 65535 + + ; 32R2: wsbh $[[RESULT:[0-9]+]], $[[A_VAL]] + + %1 = load i16, i16* @a, align 2 + %2 = call i16 @llvm.bswap.i16(i16 %1) + store i16 %2, i16* @a1, align 2 + ret void +} + +define void @b32() { + ; ALL-LABEL: b32: + + ; ALL: lw $[[B_ADDR:[0-9]+]], %got(b)($[[GOT_ADDR:[0-9]+]]) + ; ALL: lw $[[B_VAL:[0-9]+]], 0($[[B_ADDR]]) + + ; 32R1: srl $[[TMP1:[0-9]+]], $[[B_VAL]], 8 + ; 32R1: srl $[[TMP2:[0-9]+]], $[[B_VAL]], 24 + ; 32R1: andi $[[TMP3:[0-9]+]], $[[TMP1]], 65280 + ; 32R1: or $[[TMP4:[0-9]+]], $[[TMP2]], $[[TMP3]] + ; 32R1: andi $[[TMP5:[0-9]+]], $[[B_VAL]], 65280 + ; 32R1: sll $[[TMP6:[0-9]+]], $[[TMP5]], 8 + ; 32R1: sll $[[TMP7:[0-9]+]], $[[B_VAL]], 24 + ; 32R1: or $[[TMP8:[0-9]+]], $[[TMP4]], $[[TMP6]] + ; 32R1: or $[[RESULT:[0-9]+]], $[[TMP7]], $[[TMP8]] + + ; 32R2: wsbh $[[TMP:[0-9]+]], $[[B_VAL]] + ; 32R2: rotr $[[RESULT:[0-9]+]], $[[TMP]], 16 + + %1 = load i32, i32* @b, align 4 + %2 = call i32 @llvm.bswap.i32(i32 %1) + store i32 %2, i32* @b1, align 4 + ret void +} diff --git a/test/CodeGen/Mips/Fast-ISel/div1.ll b/test/CodeGen/Mips/Fast-ISel/div1.ll new file mode 100644 index 
000000000000..89e7f211251f --- /dev/null +++ b/test/CodeGen/Mips/Fast-ISel/div1.ll @@ -0,0 +1,55 @@ +; RUN: llc < %s -march=mipsel -mcpu=mips32 -O0 -relocation-model=pic \ +; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s +; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 -relocation-model=pic \ +; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s + +@sj = global i32 200000, align 4 +@sk = global i32 -47, align 4 +@uj = global i32 200000, align 4 +@uk = global i32 43, align 4 +@si = common global i32 0, align 4 +@ui = common global i32 0, align 4 + +define void @divs() { + ; CHECK-LABEL: divs: + + ; CHECK: lui $[[GOT1:[0-9]+]], %hi(_gp_disp) + ; CHECK: addiu $[[GOT2:[0-9]+]], $[[GOT1]], %lo(_gp_disp) + ; CHECK: addu $[[GOT:[0-9]+]], $[[GOT2:[0-9]+]], $25 + ; CHECK-DAG: lw $[[I_ADDR:[0-9]+]], %got(si)($[[GOT]]) + ; CHECK-DAG: lw $[[K_ADDR:[0-9]+]], %got(sk)($[[GOT]]) + ; CHECK-DAG: lw $[[J_ADDR:[0-9]+]], %got(sj)($[[GOT]]) + ; CHECK-DAG: lw $[[J:[0-9]+]], 0($[[J_ADDR]]) + ; CHECK-DAG: lw $[[K:[0-9]+]], 0($[[K_ADDR]]) + ; CHECK-DAG: div $zero, $[[J]], $[[K]] + ; CHECK_DAG: teq $[[K]], $zero, 7 + ; CHECK-DAG: mflo $[[RESULT:[0-9]+]] + ; CHECK: sw $[[RESULT]], 0($[[I_ADDR]]) + %1 = load i32, i32* @sj, align 4 + %2 = load i32, i32* @sk, align 4 + %div = sdiv i32 %1, %2 + store i32 %div, i32* @si, align 4 + ret void +} + +define void @divu() { + ; CHECK-LABEL: divu: + + ; CHECK: lui $[[GOT1:[0-9]+]], %hi(_gp_disp) + ; CHECK: addiu $[[GOT2:[0-9]+]], $[[GOT1]], %lo(_gp_disp) + ; CHECK: addu $[[GOT:[0-9]+]], $[[GOT2:[0-9]+]], $25 + ; CHECK-DAG: lw $[[I_ADDR:[0-9]+]], %got(ui)($[[GOT]]) + ; CHECK-DAG: lw $[[K_ADDR:[0-9]+]], %got(uk)($[[GOT]]) + ; CHECK-DAG: lw $[[J_ADDR:[0-9]+]], %got(uj)($[[GOT]]) + ; CHECK-DAG: lw $[[J:[0-9]+]], 0($[[J_ADDR]]) + ; CHECK-DAG: lw $[[K:[0-9]+]], 0($[[K_ADDR]]) + ; CHECK-DAG: divu $zero, $[[J]], $[[K]] + ; CHECK_DAG: teq $[[K]], $zero, 7 + ; CHECK-DAG: mflo $[[RESULT:[0-9]+]] + ; CHECK: sw $[[RESULT]], 0($[[I_ADDR]]) + %1 = load i32, i32* @uj, align 4 + %2 = load i32, i32* @uk, align 4 + %div = udiv i32 %1, %2 + store i32 %div, i32* @ui, align 4 + ret void +} diff --git a/test/CodeGen/Mips/Fast-ISel/memtest1.ll b/test/CodeGen/Mips/Fast-ISel/memtest1.ll new file mode 100644 index 000000000000..a3fc4a32981c --- /dev/null +++ b/test/CodeGen/Mips/Fast-ISel/memtest1.ll @@ -0,0 +1,74 @@ +; RUN: llc < %s -march=mipsel -mcpu=mips32 -O0 -relocation-model=pic \ +; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s \ +; RUN: -check-prefix=ALL -check-prefix=32R1 +; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 -relocation-model=pic \ +; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s \ +; RUN: -check-prefix=ALL -check-prefix=32R2 + +@str = private unnamed_addr constant [12 x i8] c"hello there\00", align 1 +@src = global i8* getelementptr inbounds ([12 x i8], [12 x i8]* @str, i32 0, i32 0), align 4 +@i = global i32 12, align 4 +@dest = common global [50 x i8] zeroinitializer, align 1 + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) +declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) +declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) + +define void @cpy(i8* %src, i32 %i) { + ; ALL-LABEL: cpy: + + ; ALL-DAG: lw $[[T0:[0-9]+]], %got(dest)(${{[0-9]+}}) + ; ALL-DAG: sw $4, 24($sp) + ; ALL-DAG: move $4, $[[T0]] + ; ALL-DAG: sw $5, 20($sp) + ; ALL-DAG: lw $[[T1:[0-9]+]], 24($sp) + ; ALL-DAG: 
move $5, $[[T1]] + ; ALL-DAG: lw $6, 20($sp) + ; ALL-DAG: lw $[[T2:[0-9]+]], %got(memcpy)(${{[0-9]+}}) + ; ALL: jalr $[[T2]] + ; ALL-NEXT: nop + ; ALL-NOT: {{.*}}$2{{.*}} + call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds ([50 x i8], [50 x i8]* @dest, i32 0, i32 0), + i8* %src, i32 %i, i32 1, i1 false) + ret void +} + +define void @mov(i8* %src, i32 %i) { + ; ALL-LABEL: mov: + + + ; ALL-DAG: lw $[[T0:[0-9]+]], %got(dest)(${{[0-9]+}}) + ; ALL-DAG: sw $4, 24($sp) + ; ALL-DAG: move $4, $[[T0]] + ; ALL-DAG: sw $5, 20($sp) + ; ALL-DAG: lw $[[T1:[0-9]+]], 24($sp) + ; ALL-DAG: move $5, $[[T1]] + ; ALL-DAG: lw $6, 20($sp) + ; ALL-DAG: lw $[[T2:[0-9]+]], %got(memmove)(${{[0-9]+}}) + ; ALL: jalr $[[T2]] + ; ALL-NEXT: nop + ; ALL-NOT: {{.*}}$2{{.*}} + call void @llvm.memmove.p0i8.p0i8.i32(i8* getelementptr inbounds ([50 x i8], [50 x i8]* @dest, i32 0, i32 0), + i8* %src, i32 %i, i32 1, i1 false) + ret void +} + +define void @clear(i32 %i) { + ; ALL-LABEL: clear: + + ; ALL-DAG: lw $[[T0:[0-9]+]], %got(dest)(${{[0-9]+}}) + ; ALL-DAG: sw $4, 16($sp) + ; ALL-DAG: move $4, $[[T0]] + ; ALL-DAG: addiu $[[T1:[0-9]+]], $zero, 42 + ; 32R1-DAG: sll $[[T2:[0-9]+]], $[[T1]], 24 + ; 32R1-DAG: sra $5, $[[T2]], 24 + ; 32R2-DAG: seb $5, $[[T1]] + ; ALL-DAG: lw $6, 16($sp) + ; ALL-DAG: lw $[[T2:[0-9]+]], %got(memset)(${{[0-9]+}}) + ; ALL: jalr $[[T2]] + ; ALL-NEXT: nop + ; ALL-NOT: {{.*}}$2{{.*}} + call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([50 x i8], [50 x i8]* @dest, i32 0, i32 0), + i8 42, i32 %i, i32 1, i1 false) + ret void +} diff --git a/test/CodeGen/Mips/Fast-ISel/mul1.ll b/test/CodeGen/Mips/Fast-ISel/mul1.ll new file mode 100644 index 000000000000..0ee044bea0a7 --- /dev/null +++ b/test/CodeGen/Mips/Fast-ISel/mul1.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -march=mipsel -mcpu=mips32 -O0 \ +; RUN: -fast-isel -mips-fast-isel -relocation-model=pic +; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 \ +; RUN: -fast-isel -mips-fast-isel -relocation-model=pic + +; The test is just to make sure it is able to allocate +; registers for this example. There was an issue with allocating AC0 +; after a mul instruction. 
+ +declare { i32, i1 } @llvm.smul.with.overflow.i32(i32, i32) + +define i32 @foo(i32 %a, i32 %b) { +entry: + %0 = mul i32 %a, %b + %1 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %0, i32 %b) + %2 = extractvalue { i32, i1 } %1, 0 + ret i32 %2 +} diff --git a/test/CodeGen/Mips/Fast-ISel/rem1.ll b/test/CodeGen/Mips/Fast-ISel/rem1.ll new file mode 100644 index 000000000000..9b5e440d0eaa --- /dev/null +++ b/test/CodeGen/Mips/Fast-ISel/rem1.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s -march=mipsel -mcpu=mips32 -O0 -relocation-model=pic \ +; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s +; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 -relocation-model=pic \ +; RUN: -fast-isel=true -mips-fast-isel -fast-isel-abort=1 | FileCheck %s + +@sj = global i32 200, align 4 +@sk = global i32 -47, align 4 +@uj = global i32 200, align 4 +@uk = global i32 43, align 4 +@si = common global i32 0, align 4 +@ui = common global i32 0, align 4 + +define void @rems() { + ; CHECK-LABEL: rems: + + ; CHECK: lui $[[GOT1:[0-9]+]], %hi(_gp_disp) + ; CHECK: addiu $[[GOT2:[0-9]+]], $[[GOT1]], %lo(_gp_disp) + ; CHECK: addu $[[GOT:[0-9]+]], $[[GOT2:[0-9]+]], $25 + ; CHECK-DAG: lw $[[I_ADDR:[0-9]+]], %got(si)($[[GOT]]) + ; CHECK-DAG: lw $[[K_ADDR:[0-9]+]], %got(sk)($[[GOT]]) + ; CHECK-DAG: lw $[[J_ADDR:[0-9]+]], %got(sj)($[[GOT]]) + ; CHECK-DAG: lw $[[J:[0-9]+]], 0($[[J_ADDR]]) + ; CHECK-DAG: lw $[[K:[0-9]+]], 0($[[K_ADDR]]) + ; CHECK-DAG: div $zero, $[[J]], $[[K]] + ; CHECK_DAG: teq $[[K]], $zero, 7 + ; CHECK-DAG: mfhi $[[RESULT:[0-9]+]] + ; CHECK: sw $[[RESULT]], 0($[[I_ADDR]]) + %1 = load i32, i32* @sj, align 4 + %2 = load i32, i32* @sk, align 4 + %rem = srem i32 %1, %2 + store i32 %rem, i32* @si, align 4 + ret void +} + +; Function Attrs: noinline nounwind +define void @remu() { + ; CHECK-LABEL: remu: + + ; CHECK: lui $[[GOT1:[0-9]+]], %hi(_gp_disp) + ; CHECK: addiu $[[GOT2:[0-9]+]], $[[GOT1]], %lo(_gp_disp) + ; CHECK: addu $[[GOT:[0-9]+]], $[[GOT2:[0-9]+]], $25 + ; CHECK-DAG: lw $[[I_ADDR:[0-9]+]], %got(ui)($[[GOT]]) + ; CHECK-DAG: lw $[[K_ADDR:[0-9]+]], %got(uk)($[[GOT]]) + ; CHECK-DAG: lw $[[J_ADDR:[0-9]+]], %got(uj)($[[GOT]]) + ; CHECK-DAG: lw $[[J:[0-9]+]], 0($[[J_ADDR]]) + ; CHECK-DAG: lw $[[K:[0-9]+]], 0($[[K_ADDR]]) + ; CHECK-DAG: divu $zero, $[[J]], $[[K]] + ; CHECK_DAG: teq $[[K]], $zero, 7 + ; CHECK-DAG: mfhi $[[RESULT:[0-9]+]] + ; CHECK: sw $[[RESULT]], 0($[[I_ADDR]]) + %1 = load i32, i32* @uj, align 4 + %2 = load i32, i32* @uk, align 4 + %rem = urem i32 %1, %2 + store i32 %rem, i32* @ui, align 4 + ret void +} diff --git a/test/CodeGen/Mips/Fast-ISel/sel1.ll b/test/CodeGen/Mips/Fast-ISel/sel1.ll new file mode 100644 index 000000000000..47b6a895cde8 --- /dev/null +++ b/test/CodeGen/Mips/Fast-ISel/sel1.ll @@ -0,0 +1,91 @@ +; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O2 -relocation-model=pic \ +; RUN: -fast-isel -mips-fast-isel -fast-isel-abort=1 | FileCheck %s + +define i1 @sel_i1(i1 %j, i1 %k, i1 %l) { +entry: + ; CHECK-LABEL: sel_i1: + + ; FIXME: The following instruction is redundant. + ; CHECK: xor $[[T0:[0-9]+]], $4, $zero + ; CHECK-NEXT: sltu $[[T1:[0-9]+]], $zero, $[[T0]] + ; CHECK-NEXT: movn $6, $5, $[[T1]] + ; CHECK: move $2, $6 + %cond = icmp ne i1 %j, 0 + %res = select i1 %cond, i1 %k, i1 %l + ret i1 %res +} + +define i8 @sel_i8(i8 %j, i8 %k, i8 %l) { +entry: + ; CHECK-LABEL: sel_i8: + + ; CHECK-DAG: seb $[[T0:[0-9]+]], $4 + ; FIXME: The following 2 instructions are redundant. 
+ ; CHECK-DAG: seb $[[T1:[0-9]+]], $zero + ; CHECK: xor $[[T2:[0-9]+]], $[[T0]], $[[T1]] + ; CHECK-NEXT: sltu $[[T3:[0-9]+]], $zero, $[[T2]] + ; CHECK-NEXT: movn $6, $5, $[[T3]] + ; CHECK: move $2, $6 + %cond = icmp ne i8 %j, 0 + %res = select i1 %cond, i8 %k, i8 %l + ret i8 %res +} + +define i16 @sel_i16(i16 %j, i16 %k, i16 %l) { +entry: + ; CHECK-LABEL: sel_i16: + + ; CHECK-DAG: seh $[[T0:[0-9]+]], $4 + ; FIXME: The following 2 instructions are redundant. + ; CHECK-DAG: seh $[[T1:[0-9]+]], $zero + ; CHECK: xor $[[T2:[0-9]+]], $[[T0]], $[[T1]] + ; CHECK-NEXT: sltu $[[T3:[0-9]+]], $zero, $[[T2]] + ; CHECK-NEXT: movn $6, $5, $[[T3]] + ; CHECK: move $2, $6 + %cond = icmp ne i16 %j, 0 + %res = select i1 %cond, i16 %k, i16 %l + ret i16 %res +} + +define i32 @sel_i32(i32 %j, i32 %k, i32 %l) { +entry: + ; CHECK-LABEL: sel_i32: + + ; FIXME: The following instruction is redundant. + ; CHECK: xor $[[T0:[0-9]+]], $4, $zero + ; CHECK-NEXT: sltu $[[T1:[0-9]+]], $zero, $[[T0]] + ; CHECK-NEXT: movn $6, $5, $[[T1]] + ; CHECK: move $2, $6 + %cond = icmp ne i32 %j, 0 + %res = select i1 %cond, i32 %k, i32 %l + ret i32 %res +} + +define float @sel_float(i32 %j, float %k, float %l) { +entry: + ; CHECK-LABEL: sel_float: + + ; CHECK-DAG: mtc1 $6, $f0 + ; CHECK-DAG: mtc1 $5, $f1 + ; CHECK-DAG: xor $[[T0:[0-9]+]], $4, $zero + ; CHECK: sltu $[[T1:[0-9]+]], $zero, $[[T0]] + ; CHECK: movn.s $f0, $f1, $[[T1]] + %cond = icmp ne i32 %j, 0 + %res = select i1 %cond, float %k, float %l + ret float %res +} + +define double @sel_double(i32 %j, double %k, double %l) { +entry: + ; CHECK-LABEL: sel_double: + + ; CHECK-DAG: mtc1 $6, $f2 + ; CHECK-DAG: mthc1 $7, $f2 + ; CHECK-DAG: ldc1 $f0, 16($sp) + ; CHECK-DAG: xor $[[T0:[0-9]+]], $4, $zero + ; CHECK: sltu $[[T1:[0-9]+]], $zero, $[[T0]] + ; CHECK: movn.d $f0, $f2, $[[T1]] + %cond = icmp ne i32 %j, 0 + %res = select i1 %cond, double %k, double %l + ret double %res +} diff --git a/test/CodeGen/Mips/dynamic-stack-realignment.ll b/test/CodeGen/Mips/dynamic-stack-realignment.ll new file mode 100644 index 000000000000..777930a37ad5 --- /dev/null +++ b/test/CodeGen/Mips/dynamic-stack-realignment.ll @@ -0,0 +1,299 @@ +; RUN: llc < %s -march=mips -mcpu=mips2 | FileCheck %s \ +; RUN: --check-prefix=ALL --check-prefix=GP32 +; RUN: llc < %s -march=mips -mcpu=mips32 | FileCheck %s \ +; RUN: --check-prefix=ALL --check-prefix=GP32 +; RUN: llc < %s -march=mips -mcpu=mips32r6 | FileCheck %s \ +; RUN: --check-prefix=ALL --check-prefix=GP32 +; RUN: llc < %s -march=mips64 -mcpu=mips3 | FileCheck %s \ +; RUN: --check-prefix=ALL --check-prefix=GP64 -check-prefix=N64 +; RUN: llc < %s -march=mips64 -mcpu=mips64 | FileCheck %s \ +; RUN: --check-prefix=ALL --check-prefix=GP64 -check-prefix=N64 +; RUN: llc < %s -march=mips64 -mcpu=mips64r6 | FileCheck %s \ +; RUN: --check-prefix=ALL --check-prefix=GP64 -check-prefix=N64 +; RUN: llc < %s -march=mips64 -mcpu=mips3 -target-abi n32 | FileCheck %s \ +; RUN: --check-prefix=ALL --check-prefix=GP64 -check-prefix=N32 +; RUN: llc < %s -march=mips64 -mcpu=mips64 -target-abi n32 | FileCheck %s \ +; RUN: --check-prefix=ALL --check-prefix=GP64 -check-prefix=N32 +; RUN: llc < %s -march=mips64 -mcpu=mips64r6 -target-abi n32 | FileCheck %s \ +; RUN: --check-prefix=ALL --check-prefix=GP64 -check-prefix=N32 + +; Check dynamic stack realignment in functions without variable-sized objects. 
+ +declare void @helper_01(i32, i32, i32, i32, i32*) + +; O32 ABI +define void @func_01() { +entry: +; GP32-LABEL: func_01: + + ; prologue + ; FIXME: We are currently over-allocating stack space. This particular case + ; needs a frame of up to between 16 and 512-bytes but currently + ; allocates between 1024 and 1536 bytes + ; GP32: addiu $sp, $sp, -1024 + ; GP32: sw $ra, 1020($sp) + ; GP32: sw $fp, 1016($sp) + ; + ; GP32: move $fp, $sp + ; GP32: addiu $[[T0:[0-9]+|ra|gp]], $zero, -512 + ; GP32-NEXT: and $sp, $sp, $[[T0]] + + ; body + ; GP32: addiu $[[T1:[0-9]+]], $sp, 512 + ; GP32: sw $[[T1]], 16($sp) + + ; epilogue + ; GP32: move $sp, $fp + ; GP32: lw $fp, 1016($sp) + ; GP32: lw $ra, 1020($sp) + ; GP32: addiu $sp, $sp, 1024 + + %a = alloca i32, align 512 + call void @helper_01(i32 0, i32 0, i32 0, i32 0, i32* %a) + ret void +} + +declare void @helper_02(i32, i32, i32, i32, + i32, i32, i32, i32, i32*) + +; N32/N64 ABIs +define void @func_02() { +entry: +; GP64-LABEL: func_02: + + ; prologue + ; FIXME: We are currently over-allocating stack space. This particular case + ; needs a frame of up to between 16 and 512-bytes but currently + ; allocates between 1024 and 1536 bytes + ; N32: addiu $sp, $sp, -1024 + ; N64: daddiu $sp, $sp, -1024 + ; GP64: sd $ra, 1016($sp) + ; GP64: sd $fp, 1008($sp) + ; N32: sd $gp, 1000($sp) + ; + ; GP64: move $fp, $sp + ; N32: addiu $[[T0:[0-9]+|ra]], $zero, -512 + ; N64: daddiu $[[T0:[0-9]+|ra]], $zero, -512 + ; GP64-NEXT: and $sp, $sp, $[[T0]] + + ; body + ; N32: addiu $[[T1:[0-9]+]], $sp, 512 + ; N64: daddiu $[[T1:[0-9]+]], $sp, 512 + ; GP64: sd $[[T1]], 0($sp) + + ; epilogue + ; GP64: move $sp, $fp + ; N32: ld $gp, 1000($sp) + ; GP64: ld $fp, 1008($sp) + ; GP64: ld $ra, 1016($sp) + ; N32: addiu $sp, $sp, 1024 + ; N64: daddiu $sp, $sp, 1024 + + %a = alloca i32, align 512 + call void @helper_02(i32 0, i32 0, i32 0, i32 0, + i32 0, i32 0, i32 0, i32 0, i32* %a) + ret void +} + +; Verify that we use $fp for referencing incoming arguments. + +declare void @helper_03(i32, i32, i32, i32, i32*, i32*) + +; O32 ABI +define void @func_03(i32 %p0, i32 %p1, i32 %p2, i32 %p3, i32* %b) { +entry: +; GP32-LABEL: func_03: + + ; body + ; FIXME: We are currently over-allocating stack space. + ; GP32-DAG: addiu $[[T0:[0-9]+]], $sp, 512 + ; GP32-DAG: sw $[[T0]], 16($sp) + ; GP32-DAG: lw $[[T1:[0-9]+]], 1040($fp) + ; GP32-DAG: sw $[[T1]], 20($sp) + + %a = alloca i32, align 512 + call void @helper_03(i32 0, i32 0, i32 0, i32 0, i32* %a, i32* %b) + ret void +} + +declare void @helper_04(i32, i32, i32, i32, + i32, i32, i32, i32, i32*, i32*) + +; N32/N64 ABIs +define void @func_04(i32 %p0, i32 %p1, i32 %p2, i32 %p3, + i32 %p4, i32 %p5, i32 %p6, i32 %p7, + i32* %b) { +entry: +; GP64-LABEL: func_04: + + ; body + ; FIXME: We are currently over-allocating stack space. + ; N32-DAG: addiu $[[T0:[0-9]+]], $sp, 512 + ; N64-DAG: daddiu $[[T0:[0-9]+]], $sp, 512 + ; GP64-DAG: sd $[[T0]], 0($sp) + ; GP64-DAG: ld $[[T1:[0-9]+]], 1024($fp) + ; GP64-DAG: sd $[[T1]], 8($sp) + + %a = alloca i32, align 512 + call void @helper_04(i32 0, i32 0, i32 0, i32 0, + i32 0, i32 0, i32 0, i32 0, i32* %a, i32* %b) + ret void +} + +; Check dynamic stack realignment in functions with variable-sized objects. + +; O32 ABI +define void @func_05(i32 %sz) { +entry: +; GP32-LABEL: func_05: + + ; prologue + ; FIXME: We are currently over-allocating stack space. 
+ ; GP32: addiu $sp, $sp, -1024 + ; GP32: sw $fp, 1020($sp) + ; GP32: sw $23, 1016($sp) + ; + ; GP32: move $fp, $sp + ; GP32: addiu $[[T0:[0-9]+|gp]], $zero, -512 + ; GP32-NEXT: and $sp, $sp, $[[T0]] + ; GP32-NEXT: move $23, $sp + + ; body + ; GP32: addiu $[[T1:[0-9]+]], $zero, 222 + ; GP32: sw $[[T1]], 508($23) + + ; epilogue + ; GP32: move $sp, $fp + ; GP32: lw $23, 1016($sp) + ; GP32: lw $fp, 1020($sp) + ; GP32: addiu $sp, $sp, 1024 + + %a0 = alloca i32, i32 %sz, align 512 + %a1 = alloca i32, align 4 + + store volatile i32 111, i32* %a0, align 512 + store volatile i32 222, i32* %a1, align 4 + + ret void +} + +; N32/N64 ABIs +define void @func_06(i32 %sz) { +entry: +; GP64-LABEL: func_06: + + ; prologue + ; FIXME: We are currently over-allocating stack space. + ; N32: addiu $sp, $sp, -1024 + ; N64: daddiu $sp, $sp, -1024 + ; GP64: sd $fp, 1016($sp) + ; GP64: sd $23, 1008($sp) + ; + ; GP64: move $fp, $sp + ; GP64: addiu $[[T0:[0-9]+|gp]], $zero, -512 + ; GP64-NEXT: and $sp, $sp, $[[T0]] + ; GP64-NEXT: move $23, $sp + + ; body + ; GP64: addiu $[[T1:[0-9]+]], $zero, 222 + ; GP64: sw $[[T1]], 508($23) + + ; epilogue + ; GP64: move $sp, $fp + ; GP64: ld $23, 1008($sp) + ; GP64: ld $fp, 1016($sp) + ; N32: addiu $sp, $sp, 1024 + ; N64: daddiu $sp, $sp, 1024 + + %a0 = alloca i32, i32 %sz, align 512 + %a1 = alloca i32, align 4 + + store volatile i32 111, i32* %a0, align 512 + store volatile i32 222, i32* %a1, align 4 + + ret void +} + +; Verify that we use $fp for referencing incoming arguments and $sp for +; building outbound arguments for nested function calls. + +; O32 ABI +define void @func_07(i32 %p0, i32 %p1, i32 %p2, i32 %p3, i32 %sz) { +entry: +; GP32-LABEL: func_07: + + ; body + ; FIXME: We are currently over-allocating stack space. + ; GP32-DAG: lw $[[T0:[0-9]+]], 1040($fp) + ; + ; GP32-DAG: addiu $[[T1:[0-9]+]], $zero, 222 + ; GP32-DAG: sw $[[T1]], 508($23) + ; + ; GP32-DAG: sw $[[T2:[0-9]+]], 16($sp) + + %a0 = alloca i32, i32 %sz, align 512 + %a1 = alloca i32, align 4 + + store volatile i32 111, i32* %a0, align 512 + store volatile i32 222, i32* %a1, align 4 + + call void @helper_01(i32 0, i32 0, i32 0, i32 0, i32* %a1) + + ret void +} + +; N32/N64 ABIs +define void @func_08(i32 %p0, i32 %p1, i32 %p2, i32 %p3, + i32 %p4, i32 %p5, i32 %p6, i32 %p7, + i32 %sz) { +entry: +; GP64-LABEL: func_08: + + ; body + ; FIXME: We are currently over-allocating stack space. + ; N32-DAG: lw $[[T0:[0-9]+]], 1028($fp) + ; N64-DAG: lwu $[[T0:[0-9]+]], 1028($fp) + ; + ; GP64-DAG: addiu $[[T1:[0-9]+]], $zero, 222 + ; GP64-DAG: sw $[[T1]], 508($23) + ; + ; GP64-DAG: sd $[[T2:[0-9]+]], 0($sp) + + %a0 = alloca i32, i32 %sz, align 512 + %a1 = alloca i32, align 4 + + store volatile i32 111, i32* %a0, align 512 + store volatile i32 222, i32* %a1, align 4 + + call void @helper_02(i32 0, i32 0, i32 0, i32 0, + i32 0, i32 0, i32 0, i32 0, i32* %a1) + ret void +} + +; Check that we do not perform dynamic stack realignment in the presence of +; the "no-realign-stack" function attribute. 
+define void @func_09() "no-realign-stack" { +entry: +; ALL-LABEL: func_09: + + ; ALL-NOT: and $sp, $sp, $[[T0:[0-9]+|ra|gp]] + + %a = alloca i32, align 512 + call void @helper_01(i32 0, i32 0, i32 0, i32 0, i32* %a) + ret void +} + +define void @func_10(i32 %sz) "no-realign-stack" { +entry: +; ALL-LABEL: func_10: + + ; ALL-NOT: and $sp, $sp, $[[T0:[0-9]+|ra|gp]] + + %a0 = alloca i32, i32 %sz, align 512 + %a1 = alloca i32, align 4 + + store volatile i32 111, i32* %a0, align 512 + store volatile i32 222, i32* %a1, align 4 + + ret void +} diff --git a/test/CodeGen/Mips/ehframe-indirect.ll b/test/CodeGen/Mips/ehframe-indirect.ll index f124881a472f..dc06ef7840ff 100644 --- a/test/CodeGen/Mips/ehframe-indirect.ll +++ b/test/CodeGen/Mips/ehframe-indirect.ll @@ -1,9 +1,11 @@ -; RUN: llc -mtriple=mipsel-linux-gnu < %s | FileCheck -check-prefix=ALL -check-prefix=O32 %s -; RUN: llc -mtriple=mipsel-linux-android < %s | FileCheck -check-prefix=ALL -check-prefix=O32 %s -; RUN: llc -mtriple=mips64el-linux-gnu -target-abi=n32 < %s | FileCheck -check-prefix=ALL -check-prefix=N32 %s -; RUN: llc -mtriple=mips64el-linux-android -target-abi=n32 < %s | FileCheck -check-prefix=ALL -check-prefix=N32 %s -; RUN: llc -mtriple=mips64el-linux-gnu < %s | FileCheck -check-prefix=ALL -check-prefix=N64 %s -; RUN: llc -mtriple=mips64el-linux-android < %s | FileCheck -check-prefix=ALL -check-prefix=N64 %s +; RUN: llc -mtriple=mipsel-linux-gnu < %s -asm-verbose | FileCheck -check-prefix=ALL -check-prefix=O32 %s +; RUN: llc -mtriple=mipsel-linux-android < %s -asm-verbose | FileCheck -check-prefix=ALL -check-prefix=O32 %s +; RUN: llc -mtriple=mips64el-linux-gnu -target-abi=n32 < %s -asm-verbose | FileCheck -check-prefix=ALL -check-prefix=N32 %s +; RUN: llc -mtriple=mips64el-linux-android -target-abi=n32 < %s -asm-verbose | FileCheck -check-prefix=ALL -check-prefix=N32 %s +; RUN: llc -mtriple=mips64el-linux-gnu < %s -asm-verbose | FileCheck -check-prefix=ALL -check-prefix=N64 %s +; RUN: llc -mtriple=mips64el-linux-android < %s -asm-verbose | FileCheck -check-prefix=ALL -check-prefix=N64 %s + +@_ZTISt9exception = external constant i8* define i32 @main() { ; ALL: .cfi_startproc @@ -16,7 +18,9 @@ entry: lpad: %0 = landingpad { i8*, i32 } personality i8* - bitcast (i32 (...)* @__gxx_personality_v0 to i8*) catch i8* null + bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + catch i8* bitcast (i8** @_ZTISt9exception to i8*) ret i32 0 cont: @@ -28,6 +32,14 @@ declare i32 @__gxx_personality_v0(...) declare void @foo() +; ALL: GCC_except_table{{[0-9]+}}: +; ALL: .byte 155 # @TType Encoding = indirect pcrel sdata4 +; ALL: $[[PC_LABEL:tmp[0-9]+]]: +; ALL: .4byte ($_ZTISt9exception.DW.stub)-($[[PC_LABEL]]) +; ALL: $_ZTISt9exception.DW.stub: +; O32: .4byte _ZTISt9exception +; N32: .4byte _ZTISt9exception +; N64: .8byte _ZTISt9exception ; ALL: .hidden DW.ref.__gxx_personality_v0 ; ALL: .weak DW.ref.__gxx_personality_v0 ; ALL: .section .data.DW.ref.__gxx_personality_v0,"aGw",@progbits,DW.ref.__gxx_personality_v0,comdat diff --git a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll index 3dc1cde77095..779620e10128 100644 --- a/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll +++ b/test/CodeGen/Mips/emergency-spill-slot-near-fp.ll @@ -1,8 +1,8 @@ ; Check that register scavenging spill slot is close to $fp. 
; RUN: llc -march=mipsel -O0 < %s | FileCheck %s -; CHECK: sw ${{.*}}, 4($fp) -; CHECK: lw ${{.*}}, 4($fp) +; CHECK: sw ${{.*}}, 4($sp) +; CHECK: lw ${{.*}}, 4($sp) define i32 @main(i32 signext %argc, i8** %argv) "no-frame-pointer-elim"="true" { entry: diff --git a/test/CodeGen/NVPTX/access-non-generic.ll b/test/CodeGen/NVPTX/access-non-generic.ll index e709302918f5..5deefe881e3f 100644 --- a/test/CodeGen/NVPTX/access-non-generic.ll +++ b/test/CodeGen/NVPTX/access-non-generic.ll @@ -85,6 +85,22 @@ define i32 @ld_int_from_float() { ret i32 %1 } +define i32 @ld_int_from_global_float(float addrspace(1)* %input, i32 %i, i32 %j) { +; IR-LABEL: @ld_int_from_global_float( +; PTX-LABEL: ld_int_from_global_float( + %1 = addrspacecast float addrspace(1)* %input to float* + %2 = getelementptr float, float* %1, i32 %i +; IR-NEXT: getelementptr float, float addrspace(1)* %input, i32 %i + %3 = getelementptr float, float* %2, i32 %j +; IR-NEXT: getelementptr float, float addrspace(1)* {{%[^,]+}}, i32 %j + %4 = bitcast float* %3 to i32* +; IR-NEXT: bitcast float addrspace(1)* {{%[^ ]+}} to i32 addrspace(1)* + %5 = load i32, i32* %4 +; IR-NEXT: load i32, i32 addrspace(1)* {{%.+}} +; PTX-LABEL: ld.global + ret i32 %5 +} + declare void @llvm.cuda.syncthreads() #3 attributes #3 = { noduplicate nounwind } diff --git a/test/CodeGen/NVPTX/bug21465.ll b/test/CodeGen/NVPTX/bug21465.ll index 76af386c6516..2eae41f73a0c 100644 --- a/test/CodeGen/NVPTX/bug21465.ll +++ b/test/CodeGen/NVPTX/bug21465.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -nvptx-lower-struct-args -S | FileCheck %s +; RUN: opt < %s -nvptx-lower-kernel-args -S | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" target triple = "nvptx64-unknown-unknown" @@ -8,12 +9,15 @@ target triple = "nvptx64-unknown-unknown" ; Function Attrs: nounwind define void @_Z11TakesStruct1SPi(%struct.S* byval nocapture readonly %input, i32* nocapture %output) #0 { entry: -; CHECK-LABEL @_Z22TakesStruct1SPi -; CHECK: bitcast %struct.S* %input to i8* -; CHECK: call i8 addrspace(101)* @llvm.nvvm.ptr.gen.to.param.p101i8.p0i8 +; CHECK-LABEL: @_Z11TakesStruct1SPi +; PTX-LABEL: .visible .entry _Z11TakesStruct1SPi( +; CHECK: addrspacecast %struct.S* %input to %struct.S addrspace(101)* %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1 %0 = load i32, i32* %b, align 4 +; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}] +; PTX: ld.param.u32 [[value:%r[0-9]+]], [{{%rd[0-9]+}}+4] store i32 %0, i32* %output, align 4 +; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]] ret void } diff --git a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll index 58b191129917..c70670da13d6 100644 --- a/test/CodeGen/NVPTX/call-with-alloca-buffer.ll +++ b/test/CodeGen/NVPTX/call-with-alloca-buffer.ll @@ -24,7 +24,10 @@ entry: ; CHECK: cvta.local.u64 %SP, %rd[[BUF_REG]] ; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0] -; CHECK: ld.f32 %f[[A0_REG:[0-9]+]], [%rd[[A_REG]]] +; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]] +; FIXME: casting A1_REG to A2_REG is unnecessary; A2_REG is essentially A_REG +; CHECK: cvta.global.u64 %rd[[A2_REG:[0-9]+]], %rd[[A1_REG]] +; CHECK: ld.global.f32 %f[[A0_REG:[0-9]+]], [%rd[[A1_REG]]] ; CHECK: st.f32 [%SP+0], %f[[A0_REG]] %0 = load float, float* %a, align 4 @@ -48,7 +51,7 @@ entry: ; CHECK: 
add.u64 %rd[[SP_REG:[0-9]+]], %SP, 0 ; CHECK: .param .b64 param0; -; CHECK-NEXT: st.param.b64 [param0+0], %rd[[A_REG]] +; CHECK-NEXT: st.param.b64 [param0+0], %rd[[A2_REG]] ; CHECK-NEXT: .param .b64 param1; ; CHECK-NEXT: st.param.b64 [param1+0], %rd[[SP_REG]] ; CHECK-NEXT: call.uni diff --git a/test/CodeGen/NVPTX/globals_init.ll b/test/CodeGen/NVPTX/globals_init.ll new file mode 100644 index 000000000000..5b45f410156c --- /dev/null +++ b/test/CodeGen/NVPTX/globals_init.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +; Make sure the globals constant initializers are not prone to host endianess +; issues. + +; CHECK-DAG: .b8 Gbli08[2] = {171, 205}; +@Gbli08 = global [2 x i8] [i8 171, i8 205] + +; CHECK-DAG: .b8 Gbli16[4] = {205, 171, 1, 239}; +@Gbli16 = global [2 x i16] [i16 43981, i16 61185] + +; CHECK-DAG: .b8 Gbli32[8] = {1, 239, 205, 171, 137, 103, 69, 35}; +@Gbli32 = global [2 x i32] [i32 2882400001, i32 591751049] + +; CHECK-DAG: .b8 Gbli64[16] = {137, 103, 69, 35, 1, 239, 205, 171, 239, 205, 171, 137, 103, 69, 35, 1}; +@Gbli64 = global [2 x i64] [i64 12379813738877118345, i64 81985529216486895] + +; CHECK-DAG: .b8 Gblf32[8] = {192, 225, 100, 75, 0, 96, 106, 69}; +@Gblf32 = global [2 x float] [float 1.5e+7, float 3.75e+3] + +; CHECK-DAG: .b8 Gblf64[16] = {116, 10, 181, 48, 134, 62, 230, 58, 106, 222, 138, 98, 204, 250, 200, 75}; +@Gblf64 = global [2 x double] [double 5.75e-25, double 12.25e+56] + diff --git a/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll new file mode 100644 index 000000000000..53220bd905bd --- /dev/null +++ b/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx64-unknown-unknown" + +; Verify that both %input and %output are converted to global pointers and then +; addrspacecast'ed back to the original type. 
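; Editorial sketch (not part of the patch): at the IR level the conversion
; described above should look roughly like the following for %input; the value
; names are illustrative rather than taken from the pass output:
;   %input.global = addrspacecast float* %input to float addrspace(1)*
;   %input.gen    = addrspacecast float addrspace(1)* %input.global to float*
; The access then goes through %input.gen, and the round trip through the
; global address space is what lets instruction selection emit the
; ld.global.f32/st.global.f32 patterns checked below.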
+define void @kernel(float* %input, float* %output) { +; CHECK-LABEL: .visible .entry kernel( +; CHECK: cvta.to.global.u64 +; CHECK: cvta.to.global.u64 + %1 = load float, float* %input, align 4 +; CHECK: ld.global.f32 + store float %1, float* %output, align 4 +; CHECK: st.global.f32 + ret void +} + +!nvvm.annotations = !{!0} +!0 = !{void (float*, float*)* @kernel, !"kernel", i32 1} diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll index d4f7c3bd210a..934df30a3a7d 100644 --- a/test/CodeGen/NVPTX/pr13291-i1-store.ll +++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll @@ -3,19 +3,19 @@ define ptx_kernel void @t1(i1* %a) { ; PTX32: mov.u16 %rs{{[0-9]+}}, 0; -; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}; +; PTX32-NEXT: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}; ; PTX64: mov.u16 %rs{{[0-9]+}}, 0; -; PTX64-NEXT: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}; +; PTX64-NEXT: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}; store i1 false, i1* %a ret void } define ptx_kernel void @t2(i1* %a, i8* %b) { -; PTX32: ld.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.global.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1; ; PTX32: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1; -; PTX64: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; PTX64: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1; ; PTX64: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1; diff --git a/test/CodeGen/NVPTX/surf-read-cuda.ll b/test/CodeGen/NVPTX/surf-read-cuda.ll index ed021346c0f9..c17c71e01d3e 100644 --- a/test/CodeGen/NVPTX/surf-read-cuda.ll +++ b/test/CodeGen/NVPTX/surf-read-cuda.ll @@ -18,8 +18,8 @@ define void @foo(i64 %img, float* %red, i32 %idx) { ; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]] ; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]] %ret = sitofp i32 %val to float -; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]] -; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]] +; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]] +; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]] store float %ret, float* %red ret void } @@ -37,8 +37,8 @@ define void @bar(float* %red, i32 %idx) { ; SM20: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]] ; SM30: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]] %ret = sitofp i32 %val to float -; SM20: st.f32 [%r{{[0-9]+}}], %f[[REDF]] -; SM30: st.f32 [%r{{[0-9]+}}], %f[[REDF]] +; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]] +; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[REDF]] store float %ret, float* %red ret void } diff --git a/test/CodeGen/NVPTX/tex-read-cuda.ll b/test/CodeGen/NVPTX/tex-read-cuda.ll index c5b5600de874..d5f7c1667f17 100644 --- a/test/CodeGen/NVPTX/tex-read-cuda.ll +++ b/test/CodeGen/NVPTX/tex-read-cuda.ll @@ -16,8 +16,8 @@ define void @foo(i64 %img, float* %red, i32 %idx) { ; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXREG]], {%r{{[0-9]+}}}] %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %img, i32 %idx) %ret = extractvalue { float, float, float, float } %val, 0 -; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]] -; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]] +; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[RED]] +; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[RED]] store float %ret, float* %red ret void } @@ -34,8 +34,8 @@ define void @bar(float* %red, i32 %idx) { ; SM30: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [%rd[[TEXHANDLE]], {%r{{[0-9]+}}}] %val = tail call { float, float, 
float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx) %ret = extractvalue { float, float, float, float } %val, 0 -; SM20: st.f32 [%r{{[0-9]+}}], %f[[RED]] -; SM30: st.f32 [%r{{[0-9]+}}], %f[[RED]] +; SM20: st.global.f32 [%r{{[0-9]+}}], %f[[RED]] +; SM30: st.global.f32 [%r{{[0-9]+}}], %f[[RED]] store float %ret, float* %red ret void } diff --git a/test/CodeGen/PowerPC/fma.ll b/test/CodeGen/PowerPC/fma.ll index ab5251b2a554..9cfef398edfd 100644 --- a/test/CodeGen/PowerPC/fma.ll +++ b/test/CodeGen/PowerPC/fma.ll @@ -1,9 +1,12 @@ ; RUN: llc < %s -march=ppc32 -fp-contract=fast -mattr=-vsx | FileCheck %s ; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 | FileCheck -check-prefix=CHECK-VSX %s +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mcpu=pwr8 | FileCheck -check-prefix=CHECK-P8 %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -fp-contract=fast -mcpu=pwr8 | FileCheck -check-prefix=CHECK-P8 %s declare double @dummy1(double) #0 declare double @dummy2(double, double) #0 declare double @dummy3(double, double, double) #0 +declare float @dummy4(float, float) #0 define double @test_FMADD1(double %A, double %B, double %C) { %D = fmul double %A, %B ; <double> [#uses=1] @@ -126,3 +129,83 @@ define float @test_FNMSUBS(float %A, float %B, float %C) { ; CHECK-VSX: fnmsubs ; CHECK-VSX-NEXT: blr } + +define float @test_XSMADDMSP(float %A, float %B, float %C) { + %D = fmul float %A, %B ; <float> [#uses=1] + %E = fadd float %C, %D ; <float> [#uses=1] + ret float %E +; CHECK-P8-LABEL: test_XSMADDMSP: +; CHECK-P8: xsmaddmsp +; CHECK-P8-NEXT: blr +} + +define float @test_XSMSUBMSP(float %A, float %B, float %C) { + %D = fmul float %A, %B ; <float> [#uses=1] + %E = fsub float %D, %C ; <float> [#uses=1] + ret float %E +; CHECK-P8-LABEL: test_XSMSUBMSP: +; CHECK-P8: xsmsubmsp +; CHECK-P8-NEXT: blr +} + +define float @test_XSMADDASP(float %A, float %B, float %C, float %D) { + %E = fmul float %A, %B ; <float> [#uses=2] + %F = fadd float %E, %C ; <float> [#uses=1] + %G = fsub float %E, %D ; <float> [#uses=1] + %H = call float @dummy4(float %F, float %G) ; <float> [#uses=1] + ret float %H +; CHECK-P8-LABEL: test_XSMADDASP: +; CHECK-P8: xsmaddasp +; CHECK-P8-NEXT: xsmsubmsp +} + +define float @test_XSMSUBASP(float %A, float %B, float %C, float %D) { + %E = fmul float %A, %B ; <float> [#uses=2] + %F = fsub float %E, %C ; <float> [#uses=1] + %G = fsub float %E, %D ; <float> [#uses=1] + %H = call float @dummy4(float %F, float %G) ; <float> [#uses=1] + ret float %H +; CHECK-P8-LABEL: test_XSMSUBASP: +; CHECK-P8: xsmsubasp +; CHECK-P8-NEXT: xsmsubmsp +} + +define float @test_XSNMADDMSP(float %A, float %B, float %C) { + %D = fmul float %A, %B ; <float> [#uses=1] + %E = fadd float %D, %C ; <float> [#uses=1] + %F = fsub float -0.000000e+00, %E ; <float> [#uses=1] + ret float %F +; CHECK-P8-LABEL: test_XSNMADDMSP: +; CHECK-P8: xsnmaddmsp +; CHECK-P8-NEXT: blr +} + +define float @test_XSNMSUBMSP(float %A, float %B, float %C) { + %D = fmul float %A, %B ; <float> [#uses=1] + %E = fsub float %D, %C ; <float> [#uses=1] + %F = fsub float -0.000000e+00, %E ; <float> [#uses=1] + ret float %F +; CHECK-P8-LABEL: test_XSNMSUBMSP: +; CHECK-P8: xsnmsubmsp +; CHECK-P8-NEXT: blr +} + +define float @test_XSNMADDASP(float %A, float %B, float %C) { + %D = fmul float %A, %B ; <float> [#uses=1] + %E = fadd float %D, %C ; <float> [#uses=1] + %F = fsub float -0.000000e+00, %E ; <float> [#uses=1] + %H = call float @dummy4(float %E, float %F) ; <float> 
[#uses=1] + ret float %F +; CHECK-P8-LABEL: test_XSNMADDASP: +; CHECK-P8: xsnmaddasp +} + +define float @test_XSNMSUBASP(float %A, float %B, float %C) { + %D = fmul float %A, %B ; <float> [#uses=1] + %E = fsub float %D, %C ; <float> [#uses=1] + %F = fsub float -0.000000e+00, %E ; <float> [#uses=1] + %H = call float @dummy4(float %E, float %F) ; <float> [#uses=1] + ret float %F +; CHECK-P8-LABEL: test_XSNMSUBASP: +; CHECK-P8: xsnmsubasp +} diff --git a/test/CodeGen/PowerPC/vsx-fma-sp.ll b/test/CodeGen/PowerPC/vsx-fma-sp.ll new file mode 100644 index 000000000000..1c3e457f92cb --- /dev/null +++ b/test/CodeGen/PowerPC/vsx-fma-sp.ll @@ -0,0 +1,167 @@ +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx -fast-isel -O0 | FileCheck -check-prefix=CHECK-FISL %s +define void @test1sp(float %a, float %b, float %c, float %e, float* nocapture %d) #0 { +entry: + %0 = tail call float @llvm.fma.f32(float %b, float %c, float %a) + store float %0, float* %d, align 4 + %1 = tail call float @llvm.fma.f32(float %b, float %e, float %a) + %arrayidx1 = getelementptr inbounds float, float* %d, i64 1 + store float %1, float* %arrayidx1, align 4 + ret void + +; CHECK-LABEL: @test1sp +; CHECK-DAG: li [[C1:[0-9]+]], 4 +; CHECK-DAG: xsmaddmsp 3, 2, 1 +; CHECK-DAG: xsmaddasp 1, 2, 4 +; CHECK-DAG: stxsspx 3, 0, 7 +; CHECK-DAG: stxsspx 1, 7, [[C1]] +; CHECK: blr + +; CHECK-FISL-LABEL: @test1sp +; CHECK-FISL-DAG: fmr 0, 1 +; CHECK-FISL-DAG: xsmaddasp 0, 2, 3 +; CHECK-FISL-DAG: stxsspx 0, 0, 7 +; CHECK-FISL-DAG: xsmaddasp 1, 2, 4 +; CHECK-FISL-DAG: li [[C1:[0-9]+]], 4 +; CHECK-FISL-DAG: stxsspx 1, 7, [[C1]] +; CHECK-FISL: blr +} + +define void @test2sp(float %a, float %b, float %c, float %e, float %f, float* nocapture %d) #0 { +entry: + %0 = tail call float @llvm.fma.f32(float %b, float %c, float %a) + store float %0, float* %d, align 4 + %1 = tail call float @llvm.fma.f32(float %b, float %e, float %a) + %arrayidx1 = getelementptr inbounds float, float* %d, i64 1 + store float %1, float* %arrayidx1, align 4 + %2 = tail call float @llvm.fma.f32(float %b, float %f, float %a) + %arrayidx2 = getelementptr inbounds float, float* %d, i64 2 + store float %2, float* %arrayidx2, align 4 + ret void + +; CHECK-LABEL: @test2sp +; CHECK-DAG: li [[C1:[0-9]+]], 4 +; CHECK-DAG: li [[C2:[0-9]+]], 8 +; CHECK-DAG: xsmaddmsp 3, 2, 1 +; CHECK-DAG: xsmaddmsp 4, 2, 1 +; CHECK-DAG: xsmaddasp 1, 2, 5 +; CHECK-DAG: stxsspx 3, 0, 8 +; CHECK-DAG: stxsspx 4, 8, [[C1]] +; CHECK-DAG: stxsspx 1, 8, [[C2]] +; CHECK: blr + +; CHECK-FISL-LABEL: @test2sp +; CHECK-FISL-DAG: fmr 0, 1 +; CHECK-FISL-DAG: xsmaddasp 0, 2, 3 +; CHECK-FISL-DAG: stxsspx 0, 0, 8 +; CHECK-FISL-DAG: fmr 0, 1 +; CHECK-FISL-DAG: xsmaddasp 0, 2, 4 +; CHECK-FISL-DAG: li [[C1:[0-9]+]], 4 +; CHECK-FISL-DAG: stxsspx 0, 8, [[C1]] +; CHECK-FISL-DAG: xsmaddasp 1, 2, 5 +; CHECK-FISL-DAG: li [[C2:[0-9]+]], 8 +; CHECK-FISL-DAG: stxsspx 1, 8, [[C2]] +; CHECK-FISL: blr +} + +define void @test3sp(float %a, float %b, float %c, float %e, float %f, float* nocapture %d) #0 { +entry: + %0 = tail call float @llvm.fma.f32(float %b, float %c, float %a) + store float %0, float* %d, align 4 + %1 = tail call float @llvm.fma.f32(float %b, float %e, float %a) + %2 = tail call float @llvm.fma.f32(float %b, float %c, float %1) + %arrayidx1 = getelementptr inbounds float, float* %d, i64 3 + store float %2, float* %arrayidx1, align 4 + %3 = tail call float @llvm.fma.f32(float %b, float %f, float %a) + 
%arrayidx2 = getelementptr inbounds float, float* %d, i64 2 + store float %3, float* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds float, float* %d, i64 1 + store float %1, float* %arrayidx3, align 4 + ret void + +; CHECK-LABEL: @test3sp +; CHECK-DAG: fmr [[F1:[0-9]+]], 1 +; CHECK-DAG: li [[C1:[0-9]+]], 12 +; CHECK-DAG: li [[C2:[0-9]+]], 8 +; CHECK-DAG: li [[C3:[0-9]+]], 4 +; CHECK-DAG: xsmaddmsp 4, 2, 1 +; CHECK-DAG: xsmaddasp 1, 2, 5 + +; Note: We could convert this next FMA to M-type as well, but it would require +; re-ordering the instructions. +; CHECK-DAG: xsmaddasp [[F1]], 2, 3 + +; CHECK-DAG: xsmaddmsp 3, 2, 4 +; CHECK-DAG: stxsspx [[F1]], 0, 8 +; CHECK-DAG: stxsspx 3, 8, [[C1]] +; CHECK-DAG: stxsspx 1, 8, [[C2]] +; CHECK-DAG: stxsspx 4, 8, [[C3]] +; CHECK: blr + +; CHECK-FISL-LABEL: @test3sp +; CHECK-FISL-DAG: fmr [[F1:[0-9]+]], 1 +; CHECK-FISL-DAG: xsmaddasp [[F1]], 2, 4 +; CHECK-FISL-DAG: fmr 4, [[F1]] +; CHECK-FISL-DAG: xsmaddasp 4, 2, 3 +; CHECK-FISL-DAG: li [[C1:[0-9]+]], 12 +; CHECK-FISL-DAG: stxsspx 4, 8, [[C1]] +; CHECK-FISL-DAG: xsmaddasp 1, 2, 5 +; CHECK-FISL-DAG: li [[C2:[0-9]+]], 8 +; CHECK-FISL-DAG: stxsspx 1, 8, [[C2]] +; CHECK-FISL-DAG: li [[C3:[0-9]+]], 4 +; CHECK-FISL-DAG: stxsspx 0, 8, [[C3]] +; CHECK-FISL: blr +} + +define void @test4sp(float %a, float %b, float %c, float %e, float %f, float* nocapture %d) #0 { +entry: + %0 = tail call float @llvm.fma.f32(float %b, float %c, float %a) + store float %0, float* %d, align 4 + %1 = tail call float @llvm.fma.f32(float %b, float %e, float %a) + %arrayidx1 = getelementptr inbounds float, float* %d, i64 1 + store float %1, float* %arrayidx1, align 4 + %2 = tail call float @llvm.fma.f32(float %b, float %c, float %1) + %arrayidx3 = getelementptr inbounds float, float* %d, i64 3 + store float %2, float* %arrayidx3, align 4 + %3 = tail call float @llvm.fma.f32(float %b, float %f, float %a) + %arrayidx4 = getelementptr inbounds float, float* %d, i64 2 + store float %3, float* %arrayidx4, align 4 + ret void + +; CHECK-LABEL: @test4sp +; CHECK-DAG: fmr [[F1:[0-9]+]], 1 +; CHECK-DAG: li [[C1:[0-9]+]], 4 +; CHECK-DAG: li [[C2:[0-9]+]], 8 +; CHECK-DAG: xsmaddmsp 4, 2, 1 + +; Note: We could convert this next FMA to M-type as well, but it would require +; re-ordering the instructions. 
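; Editorial note (not part of the patch), assuming the Power ISA 2.07 semantics
; of the two scalar single-precision FMA encodings:
;   xsmaddasp XT, XA, XB   ; A-form: XT <- XA*XB + XT  (target acts as the addend)
;   xsmaddmsp XT, XA, XB   ; M-form: XT <- XA*XT + XB  (target acts as a multiplicand)
; Which form is usable therefore depends on which input value is free to be
; overwritten, which is the constraint behind the re-ordering caveat above.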
+; CHECK-DAG: xsmaddasp 1, 2, 5 + +; CHECK-DAG: xsmaddasp [[F1]], 2, 3 +; CHECK-DAG: stxsspx [[F1]], 0, 8 +; CHECK-DAG: stxsspx 4, 8, [[C1]] +; CHECK-DAG: li [[C3:[0-9]+]], 12 +; CHECK-DAG: xsmaddasp 4, 2, 3 +; CHECK-DAG: stxsspx 4, 8, [[C3]] +; CHECK-DAG: stxsspx 1, 8, [[C2]] +; CHECK: blr + +; CHECK-FISL-LABEL: @test4sp +; CHECK-FISL-DAG: fmr [[F1:[0-9]+]], 1 +; CHECK-FISL-DAG: xsmaddasp [[F1]], 2, 3 +; CHECK-FISL-DAG: stxsspx 0, 0, 8 +; CHECK-FISL-DAG: fmr [[F1]], 1 +; CHECK-FISL-DAG: xsmaddasp [[F1]], 2, 4 +; CHECK-FISL-DAG: li [[C3:[0-9]+]], 4 +; CHECK-FISL-DAG: stxsspx 0, 8, [[C3]] +; CHECK-FISL-DAG: xsmaddasp 0, 2, 3 +; CHECK-FISL-DAG: li [[C1:[0-9]+]], 12 +; CHECK-FISL-DAG: stxsspx 0, 8, [[C1]] +; CHECK-FISL-DAG: xsmaddasp 1, 2, 5 +; CHECK-FISL-DAG: li [[C2:[0-9]+]], 8 +; CHECK-FISL-DAG: stxsspx 1, 8, [[C2]] +; CHECK-FISL: blr +} + +declare float @llvm.fma.f32(float, float, float) #0 diff --git a/test/CodeGen/R600/cgp-addressing-modes.ll b/test/CodeGen/R600/cgp-addressing-modes.ll new file mode 100644 index 000000000000..3d36bd19937e --- /dev/null +++ b/test/CodeGen/R600/cgp-addressing-modes.ll @@ -0,0 +1,242 @@ +; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown < %s | FileCheck -check-prefix=OPT %s +; RUN: llc -march=amdgcn -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN %s + +declare i32 @llvm.r600.read.tidig.x() #0 + +; OPT-LABEL: @test_sink_global_small_offset_i32( +; OPT-NOT: getelementptr i32, i32 addrspace(1)* %in +; OPT: br i1 +; OPT: ptrtoint + +; GCN-LABEL: {{^}}test_sink_global_small_offset_i32: +; GCN: {{^}}BB0_2: +define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(1)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_global_small_max_i32_ds_offset( +; OPT: %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 +; OPT: br i1 + +; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GCN: {{^}}BB1_2: +; GCN: s_or_b64 exec +define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 + %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_global_small_max_mubuf_offset: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} +; GCN: {{^}}BB2_2: +; GCN: s_or_b64 exec +define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 + %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + 
%tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_global_small_max_plus_1_mubuf_offset: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_sbyte {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; GCN: {{^}}BB3_2: +; GCN: s_or_b64 exec +define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 + %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i8, i8 addrspace(1)* %in.gep + %tmp2 = sext i8 %tmp1 to i32 + br label %endif + +endif: + %x = phi i32 [ %tmp2, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_no_sink_flat_small_offset_i32( +; OPT: getelementptr i32, i32 addrspace(4)* %in +; OPT: br i1 +; OPT-NOT: ptrtoint + +; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: +; GCN: flat_load_dword +; GCN: {{^}}BB4_2: + +define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +entry: + %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(4)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(4)* %out.gep + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_sink_scratch_small_offset_i32( +; OPT-NOT: getelementptr [512 x i32] +; OPT: br i1 +; OPT: ptrtoint + +; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32: +; GCN: s_and_saveexec_b64 +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: {{^}}BB5_2: +define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { +entry: + %alloca = alloca [512 x i32], align 4 + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %add.arg = add i32 %arg, 8 + %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + store volatile i32 123, i32* %alloca.gep + %tmp1 = load volatile i32, i32* %alloca.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep.0 + %load = load volatile i32, i32* %alloca.gep + store i32 %load, i32 addrspace(1)* %out.gep.1 + br label %done + +done: + ret void +} + +; OPT-LABEL: @test_no_sink_scratch_large_offset_i32( +; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 +; OPT: br i1 +; OPT-NOT: ptrtoint + +; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32: +; GCN: s_and_saveexec_b64 +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: {{^}}BB6_2: +define void 
@test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond, i32 %arg) { +entry: + %alloca = alloca [512 x i32], align 4 + %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 + %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %add.arg = add i32 %arg, 8 + %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024 + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + store volatile i32 123, i32* %alloca.gep + %tmp1 = load volatile i32, i32* %alloca.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep.0 + %load = load volatile i32, i32* %alloca.gep + store i32 %load, i32 addrspace(1)* %out.gep.1 + br label %done + +done: + ret void +} + +; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32: +; GCN: s_and_saveexec_b64 +; GCN: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN: {{^}}BB7_2: +define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset, i32 %cond) { +entry: + %offset.ext = zext i32 %offset to i64 + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 + %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 %offset.ext + %tmp0 = icmp eq i32 %cond, 0 + br i1 %tmp0, label %endif, label %if + +if: + %tmp1 = load i32, i32 addrspace(1)* %in.gep + br label %endif + +endif: + %x = phi i32 [ %tmp1, %if ], [ 0, %entry ] + store i32 %x, i32 addrspace(1)* %out.gep + br label %done + +done: + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/R600/coalescer_remat.ll b/test/CodeGen/R600/coalescer_remat.ll index f78a77b36154..96730bcf2e8f 100644 --- a/test/CodeGen/R600/coalescer_remat.ll +++ b/test/CodeGen/R600/coalescer_remat.ll @@ -1,5 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -o - %s | FileCheck %s -target triple="amdgcn--" +; RUN: llc -march=amdgcn -verify-machineinstrs -mtriple=amdgcn-- -o - %s | FileCheck %s declare float @llvm.fma.f32(float, float, float) @@ -12,7 +11,8 @@ declare float @llvm.fma.f32(float, float, float) ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 -; CHECK: ; NumVgprs: 12 +; It's probably OK if this is slightly higher: +; CHECK: ; NumVgprs: 9 define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { entry: %cmpflag = icmp eq i32 %flag, 1 diff --git a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll index 0aecc189e0bf..585172092676 100644 --- a/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll +++ b/test/CodeGen/R600/codegen-prepare-addrmode-sext.ll @@ -1,12 +1,10 @@ -; RUN: opt -codegenprepare -S -o - %s | FileCheck --check-prefix=OPT %s -; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-LLC %s +; RUN: opt -mtriple=amdgcn-- -codegenprepare -S < %s | FileCheck -check-prefix=OPT %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI-LLC %s -target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" -target triple = "r600--" - -; OPT-LABEL: @test +; OPT-LABEL: @test( ; OPT: mul nsw i32 ; OPT-NEXT: sext + ; SI-LLC-LABEL: {{^}}test: ; SI-LLC: s_mul_i32 ; SI-LLC-NOT: 
mul diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll index 68ebc4dedfe0..addc409c9eb1 100644 --- a/test/CodeGen/R600/fmul.ll +++ b/test/CodeGen/R600/fmul.ll @@ -73,4 +73,20 @@ define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { ret void } +; There should be three multiplies here; %a should be used twice (once +; negated), not duplicated into mul x, 5.0 and mul x, -5.0. +; FUNC-LABEL: {{^}}test_mul_twouse: +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI: v_mul_f32 +; SI-NOT: v_mul_f32 +define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 { + %a = fmul float %x, 5.0 + %b = fsub float -0.0, %a + %c = fmul float %b, %y + %d = fmul float %c, %a + store float %d, float addrspace(1)* %out + ret void +} + attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } diff --git a/test/CodeGen/R600/half.ll b/test/CodeGen/R600/half.ll index 42ee788e88d5..bf8f11860b50 100644 --- a/test/CodeGen/R600/half.ll +++ b/test/CodeGen/R600/half.ll @@ -1,62 +1,525 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -define void @test_load_store(half addrspace(1)* %in, half addrspace(1)* %out) { -; CHECK-LABEL: {{^}}test_load_store: -; CHECK: buffer_load_ushort [[TMP:v[0-9]+]] -; CHECK: buffer_store_short [[TMP]] +; half args should be promoted to float + +; GCN-LABEL: {{^}}load_f16_arg: +; GCN: s_load_dword [[ARG:s[0-9]+]] +; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]] +; GCN: buffer_store_short [[CVT]] +define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { + store half %arg, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v2f16_arg: +; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 +; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 +; GCN-DAG: buffer_store_short [[V0]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN-DAG: buffer_store_short [[V1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} +; GCN: s_endpgm +define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { + store <2 x half> %arg, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v3f16_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN-NOT: buffer_load +; GCN-DAG: buffer_store_dword +; GCN-DAG: buffer_store_short +; GCN-NOT: buffer_store +; GCN: s_endpgm +define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { + store <3 x half> %arg, <3 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v4f16_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { + store <4 x half> %arg, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}load_v8f16_arg: +define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { + store <8 x half> %arg, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v2f16_arg: +define void @extload_v2f16_arg(<2 x float> 
addrspace(1)* %out, <2 x half> %in) #0 { + %fpext = fpext <2 x half> %in to <2 x float> + store <2 x float> %fpext, <2 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_f16_to_f32_arg: +define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { + %ext = fpext half %arg to float + store float %ext, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg: +define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { + %ext = fpext <2 x half> %arg to <2 x float> + store <2 x float> %ext, <2 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN: buffer_load_ushort +; GCN-NOT: buffer_load +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 +; GCN-NOT: v_cvt_f32_f16 +; GCN-DAG: buffer_store_dword +; GCN-DAG: buffer_store_dwordx2 +; GCN: s_endpgm +define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { + %ext = fpext <3 x half> %arg to <3 x float> + store <3 x float> %ext, <3 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg: +define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { + %ext = fpext <4 x half> %arg to <4 x float> + store <4 x float> %ext, <4 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: +define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { + %ext = fpext <8 x half> %arg to <8 x float> + store <8 x float> %ext, <8 x float> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_f16_to_f64_arg: +define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { + %ext = fpext half %arg to double + store double %ext, double addrspace(1)* %out + ret void +} +; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: +define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { + %ext = fpext <2 x half> %arg to <2 x double> + store <2 x double> %ext, <2 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: +define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { + %ext = fpext <3 x half> %arg to <3 x double> + store <3 x double> %ext, <3 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: +define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { + %ext = fpext <4 x half> %arg to <4 x double> + store <4 x double> %ext, <4 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: +define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { + %ext = fpext <8 x half> %arg to <8 x double> + store <8 x double> %ext, <8 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_f16: +; GCN: buffer_load_ushort [[TMP:v[0-9]+]] +; GCN: buffer_store_short [[TMP]] +define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %val = load half, half addrspace(1)* %in - store half %val, half addrspace(1) * %out + store half %val, half addrspace(1)* %out ret void } -define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) { -; CHECK-LABEL: {{^}}test_bitcast_from_half: -; CHECK: buffer_load_ushort [[TMP:v[0-9]+]] -; CHECK: buffer_store_short 
[[TMP]] - %val = load half, half addrspace(1) * %in - %val_int = bitcast half %val to i16 - store i16 %val_int, i16 addrspace(1)* %out +; GCN-LABEL: {{^}}global_load_store_v2f16: +; GCN: buffer_load_dword [[TMP:v[0-9]+]] +; GCN: buffer_store_dword [[TMP]] +define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + store <2 x half> %val, <2 x half> addrspace(1)* %out ret void } -define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) { -; CHECK-LABEL: {{^}}test_bitcast_to_half: -; CHECK: buffer_load_ushort [[TMP:v[0-9]+]] -; CHECK: buffer_store_short [[TMP]] - %val = load i16, i16 addrspace(1)* %in - %val_fp = bitcast i16 %val to half - store half %val_fp, half addrspace(1)* %out +; GCN-LABEL: {{^}}global_load_store_v4f16: +; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx2 [[TMP]] +define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { + %val = load <4 x half>, <4 x half> addrspace(1)* %in + store <4 x half> %val, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_load_store_v8f16: +; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] +; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] +; GCN: s_endpgm +define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { + %val = load <8 x half>, <8 x half> addrspace(1)* %in + store <8 x half> %val, <8 x half> addrspace(1)* %out ret void } -define void @test_extend32(half addrspace(1)* %in, float addrspace(1)* %out) { -; CHECK-LABEL: {{^}}test_extend32: -; CHECK: v_cvt_f32_f16_e32 +; GCN-LABEL: {{^}}global_extload_f16_to_f32: +; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] +; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]] +; GCN: buffer_store_dword [[CVT]] +define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { + %val = load half, half addrspace(1)* %in + %cvt = fpext half %val to float + store float %cvt, float addrspace(1)* %out + ret void +} - %val16 = load half, half addrspace(1)* %in - %val32 = fpext half %val16 to float - store float %val32, float addrspace(1)* %out +; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: +define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %cvt = fpext <2 x half> %val to <2 x float> + store <2 x float> %cvt, <2 x float> addrspace(1)* %out ret void } -define void @test_extend64(half addrspace(1)* %in, double addrspace(1)* %out) { -; CHECK-LABEL: {{^}}test_extend64: -; CHECK: v_cvt_f32_f16_e32 -; CHECK: v_cvt_f64_f32_e32 +; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32: +define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { + %val = load <3 x half>, <3 x half> addrspace(1)* %in + %cvt = fpext <3 x half> %val to <3 x float> + store <3 x float> %cvt, <3 x float> addrspace(1)* %out + ret void +} - %val16 = load half, half addrspace(1)* %in - %val64 = fpext half %val16 to double - store double %val64, double addrspace(1)* %out +; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32: +define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { + %val = load <4 x half>, <4 x half> addrspace(1)* %in + %cvt = fpext <4 x half> %val to <4 x float> + store <4 x float> %cvt, <4 x float> addrspace(1)* %out ret 
void } -define void @test_trunc32(float addrspace(1)* %in, half addrspace(1)* %out) { -; CHECK-LABEL: {{^}}test_trunc32: -; CHECK: v_cvt_f16_f32_e32 +; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32: +define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { + %val = load <8 x half>, <8 x half> addrspace(1)* %in + %cvt = fpext <8 x half> %val to <8 x float> + store <8 x float> %cvt, <8 x float> addrspace(1)* %out + ret void +} - %val32 = load float, float addrspace(1)* %in - %val16 = fptrunc float %val32 to half - store half %val16, half addrspace(1)* %out +; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32: +define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { + %val = load <16 x half>, <16 x half> addrspace(1)* %in + %cvt = fpext <16 x half> %val to <16 x float> + store <16 x float> %cvt, <16 x float> addrspace(1)* %out ret void } + +; GCN-LABEL: {{^}}global_extload_f16_to_f64: +; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] +; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]] +; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]] +; GCN: buffer_store_dwordx2 [[CVT1]] +define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { + %val = load half, half addrspace(1)* %in + %cvt = fpext half %val to double + store double %cvt, double addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: +define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %cvt = fpext <2 x half> %val to <2 x double> + store <2 x double> %cvt, <2 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: +define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { + %val = load <3 x half>, <3 x half> addrspace(1)* %in + %cvt = fpext <3 x half> %val to <3 x double> + store <3 x double> %cvt, <3 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64: +define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { + %val = load <4 x half>, <4 x half> addrspace(1)* %in + %cvt = fpext <4 x half> %val to <4 x double> + store <4 x double> %cvt, <4 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64: +define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { + %val = load <8 x half>, <8 x half> addrspace(1)* %in + %cvt = fpext <8 x half> %val to <8 x double> + store <8 x double> %cvt, <8 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64: +define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { + %val = load <16 x half>, <16 x half> addrspace(1)* %in + %cvt = fpext <16 x half> %val to <16 x double> + store <16 x double> %cvt, <16 x double> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_f32_to_f16: +; GCN: buffer_load_dword [[LOAD:v[0-9]+]] +; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]] +; GCN: buffer_store_short [[CVT]] +define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { + %val = load float, float addrspace(1)* %in + %cvt = fptrunc float %val to half + store half %cvt, half addrspace(1)* %out + ret 
void +} + +; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16: +; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] +; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] +; GCN-DAG: buffer_store_short [[CVT0]] +; GCN-DAG: buffer_store_short [[CVT1]] +; GCN: s_endpgm +define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { + %val = load <2 x float>, <2 x float> addrspace(1)* %in + %cvt = fptrunc <2 x float> %val to <2 x half> + store <2 x half> %cvt, <2 x half> addrspace(1)* %out + ret void +} + +; FIXME: Shouldn't do 4th conversion +; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: +; GCN: buffer_load_dwordx4 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_dword +; GCN: s_endpgm +define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { + %val = load <3 x float>, <3 x float> addrspace(1)* %in + %cvt = fptrunc <3 x float> %val to <3 x half> + store <3 x half> %cvt, <3 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16: +; GCN: buffer_load_dwordx4 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { + %val = load <4 x float>, <4 x float> addrspace(1)* %in + %cvt = fptrunc <4 x float> %val to <4 x half> + store <4 x half> %cvt, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { + %val = load <8 x float>, <8 x float> addrspace(1)* %in + %cvt = fptrunc <8 x float> %val to <8 x half> + store <8 x half> %cvt, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16: +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: buffer_load_dword +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: 
v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: v_cvt_f16_f32_e32 +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: buffer_store_short +; GCN: s_endpgm +define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { + %val = load <16 x float>, <16 x float> addrspace(1)* %in + %cvt = fptrunc <16 x float> %val to <16 x half> + store <16 x half> %cvt, <16 x half> addrspace(1)* %out + ret void +} + +; FIXME: Unsafe math should fold conversions away +; GCN-LABEL: {{^}}fadd_f16: +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { + %add = fadd half %a, %b + store half %add, half addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fadd_v2f16: +; SI: v_add_f32 +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { + %add = fadd <2 x half> %a, %b + store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8 + ret void +} + +; GCN-LABEL: {{^}}fadd_v4f16: +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { + %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1 + %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16 + %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16 + %result = fadd <4 x half> %a, %b + store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16 + ret void +} + +; GCN-LABEL: {{^}}fadd_v8f16: +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; SI: v_add_f32 +; GCN: s_endpgm +define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { + %add = fadd <8 x half> %a, %b + store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32 + ret void +} + +; GCN-LABEL: {{^}}fsub_f16: +; GCN: v_subrev_f32_e32 +; GCN: s_endpgm +define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { + %b_ptr = getelementptr half, half addrspace(1)* %in, i32 1 + %a = load half, half addrspace(1)* %in + %b = load half, half addrspace(1)* %b_ptr + %sub = fsub half %a, %b + store half %sub, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_bitcast_from_half: +; GCN: buffer_load_ushort [[TMP:v[0-9]+]] +; GCN: buffer_store_short [[TMP]] +define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { + %val = load half, half addrspace(1)* %in + %val_int = bitcast half %val to i16 + store i16 %val_int, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_bitcast_to_half: +; GCN: buffer_load_ushort [[TMP:v[0-9]+]] +; GCN: buffer_store_short [[TMP]] +define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { + %val = load i16, i16 addrspace(1)* %in + %val_fp = bitcast i16 %val 
to half
+  store half %val_fp, half addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/R600/imm.ll b/test/CodeGen/R600/imm.ll
index 8917cd6dba33..12eed550eb1f 100644
--- a/test/CodeGen/R600/imm.ll
+++ b/test/CodeGen/R600/imm.ll
@@ -36,7 +36,7 @@ define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
 ; CHECK-LABEL: {{^}}store_inline_imm_neg_0.0_i32:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80000000
-; CHECK-NEXT: buffer_store_dword [[REG]]
+; CHECK: buffer_store_dword [[REG]]
 define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
   store i32 -2147483648, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/R600/loop-address.ll b/test/CodeGen/R600/loop-address.ll
index 7fadb8dba7b8..f60d574497de 100644
--- a/test/CodeGen/R600/loop-address.ll
+++ b/test/CodeGen/R600/loop-address.ll
@@ -1,13 +1,10 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s
 ;CHECK: ALU_PUSH
 ;CHECK: LOOP_START_DX10 @11
 ;CHECK: LOOP_BREAK @10
 ;CHECK: POP @10
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
-target triple = "r600--"
-
 define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 {
 entry:
   %cmp5 = icmp sgt i32 %iterations, 0
diff --git a/test/CodeGen/R600/loop-idiom.ll b/test/CodeGen/R600/loop-idiom.ll
index 810b34fed865..5fd9806813cd 100644
--- a/test/CodeGen/R600/loop-idiom.ll
+++ b/test/CodeGen/R600/loop-idiom.ll
@@ -2,10 +2,6 @@
 ; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=SI -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
 ; RUN: opt -basicaa -loop-idiom -S < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs| FileCheck --check-prefix=SI --check-prefix=FUNC %s
-target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
-target triple = "r600--"
-
-
 ; Make sure loop-idiom doesn't create memcpy or memset. There are no library
 ; implementations of these for R600.
diff --git a/test/CodeGen/R600/max.ll b/test/CodeGen/R600/max.ll
index 1aa9e6883011..fef3e2f0a21c 100644
--- a/test/CodeGen/R600/max.ll
+++ b/test/CodeGen/R600/max.ll
@@ -115,3 +115,54 @@ define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
 }
+
+; Make sure the redundant 'and' is removed.
+; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_max_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
+; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI-NEXT: buffer_store_dword [[VMIN]]
+define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
+  %a.ext = zext i16 %a to i32
+  %b.ext = zext i16 %b to i32
+  %cmp = icmp ugt i32 %a.ext, %b.ext
+  %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
+  %mask = and i32 %val, 65535
+  store i32 %mask, i32 addrspace(1)* %out
+  ret void
+}
+
+; Make sure the redundant sign_extend_inreg is removed.
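+; (The shl-by-16 / ashr-by-16 pair in the test below is the
+; sign_extend_inreg pattern in question.)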
+
+; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_max_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
+; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI-NEXT: buffer_store_dword [[VMIN]]
+define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
+  %a.ext = sext i16 %a to i32
+  %b.ext = sext i16 %b to i32
+  %cmp = icmp sgt i32 %a.ext, %b.ext
+  %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
+  %shl = shl i32 %val, 16
+  %sextinreg = ashr i32 %shl, 16
+  store i32 %sextinreg, i32 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should be able to match min/max through extends inserted by
+; legalization.
+
+; FUNC-LABEL: {{^}}s_test_imin_sge_i16:
+; SI: s_sext_i32_i16
+; SI: s_sext_i32_i16
+; SI: v_cmp_ge_i32_e32
+; SI: v_cndmask_b32
+define void @s_test_imin_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+  %cmp = icmp sge i16 %a, %b
+  %val = select i1 %cmp, i16 %a, i16 %b
+  store i16 %val, i16 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/min.ll b/test/CodeGen/R600/min.ll
index 275e9a7d899b..0332d1a8e407 100644
--- a/test/CodeGen/R600/min.ll
+++ b/test/CodeGen/R600/min.ll
@@ -136,3 +136,54 @@ define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace
   store i1 %cmp, i1 addrspace(1)* %outgep1
   ret void
 }
+
+; Make sure the redundant 'and' is removed.
+; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
+; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI-NEXT: buffer_store_dword [[VMIN]]
+define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
+  %a.ext = zext i16 %a to i32
+  %b.ext = zext i16 %b to i32
+  %cmp = icmp ult i32 %a.ext, %b.ext
+  %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
+  %mask = and i32 %val, 65535
+  store i32 %mask, i32 addrspace(1)* %out
+  ret void
+}
+
+; Make sure the redundant sign_extend_inreg is removed.
+
+; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16:
+; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
+; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
+; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; SI-NEXT: buffer_store_dword [[VMIN]]
+define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
+  %a.ext = sext i16 %a to i32
+  %b.ext = sext i16 %b to i32
+  %cmp = icmp slt i32 %a.ext, %b.ext
+  %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
+  %shl = shl i32 %val, 16
+  %sextinreg = ashr i32 %shl, 16
+  store i32 %sextinreg, i32 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Should be able to match min/max through extends inserted by
+; legalization.
+ +; FUNC-LABEL: {{^}}s_test_imin_sle_i16: +; SI: s_sext_i32_i16 +; SI: s_sext_i32_i16 +; SI: v_cmp_le_i32_e32 +; SI: v_cndmask_b32 +define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { + %cmp = icmp sle i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll index d9ad4935968d..5aedda2ce1a9 100644 --- a/test/CodeGen/R600/sext-in-reg.ll +++ b/test/CodeGen/R600/sext-in-reg.ll @@ -450,13 +450,10 @@ define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x ret void } -; FIXME: The BFE should really be eliminated. I think it should happen -; when computeKnownBitsForTargetNode is implemented for imax. - ; FUNC-LABEL: {{^}}sext_in_reg_to_illegal_type: ; SI: buffer_load_sbyte ; SI: v_max_i32 -; SI: v_bfe_i32 +; SI-NOT: bfe ; SI: buffer_store_short define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) nounwind { %tmp5 = load i8, i8 addrspace(1)* %src, align 1 diff --git a/test/CodeGen/R600/si-vector-hang.ll b/test/CodeGen/R600/si-vector-hang.ll index 94c47fe3c600..bd427dd3ed46 100644 --- a/test/CodeGen/R600/si-vector-hang.ll +++ b/test/CodeGen/R600/si-vector-hang.ll @@ -11,10 +11,7 @@ ; CHECK: buffer_store_byte ; CHECK: buffer_store_byte ; ModuleID = 'radeon' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64" -target triple = "r600--" -; Function Attrs: nounwind define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 { entry: %0 = load i8, i8 addrspace(1)* %in0, align 1 diff --git a/test/CodeGen/R600/subreg-eliminate-dead.ll b/test/CodeGen/R600/subreg-eliminate-dead.ll new file mode 100644 index 000000000000..8bd995a8ecbb --- /dev/null +++ b/test/CodeGen/R600/subreg-eliminate-dead.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck %s +; LiveRangeEdit::eliminateDeadDef did not update LiveInterval sub ranges +; properly. + +; Just make sure this test doesn't crash. 
+; CHECK-LABEL: foobar: +; CHECK: s_endpgm +define void @foobar() { + %v0 = icmp eq <4 x i32> undef, <i32 0, i32 1, i32 2, i32 3> + %v3 = sext <4 x i1> %v0 to <4 x i32> + %v4 = extractelement <4 x i32> %v3, i32 1 + %v5 = icmp ne i32 %v4, 0 + %v6 = select i1 %v5, i32 undef, i32 0 + %v15 = insertelement <2 x i32> undef, i32 %v6, i32 1 + store <2 x i32> %v15, <2 x i32> addrspace(1)* undef, align 8 + ret void +} + +declare double @llvm.fma.f64(double, double, double) diff --git a/test/CodeGen/R600/trunc-store-f64-to-f16.ll b/test/CodeGen/R600/trunc-store-f64-to-f16.ll new file mode 100644 index 000000000000..c29872beef86 --- /dev/null +++ b/test/CodeGen/R600/trunc-store-f64-to-f16.ll @@ -0,0 +1,56 @@ +; XFAIL: * +; RUN: llc -march=amdgcn -mcpu=SI < %s + +; GCN-LABEL: {{^}}global_truncstore_f64_to_f16: +; GCN: s_endpgm +define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { + %val = load double, double addrspace(1)* %in + %cvt = fptrunc double %val to half + store half %cvt, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16: +; GCN: s_endpgm +define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { + %val = load <2 x double>, <2 x double> addrspace(1)* %in + %cvt = fptrunc <2 x double> %val to <2 x half> + store <2 x half> %cvt, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16: +; GCN: s_endpgm +define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { + %val = load <3 x double>, <3 x double> addrspace(1)* %in + %cvt = fptrunc <3 x double> %val to <3 x half> + store <3 x half> %cvt, <3 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v4f64_to_v4f16: +; GCN: s_endpgm +define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { + %val = load <4 x double>, <4 x double> addrspace(1)* %in + %cvt = fptrunc <4 x double> %val to <4 x half> + store <4 x half> %cvt, <4 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16: +; GCN: s_endpgm +define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { + %val = load <8 x double>, <8 x double> addrspace(1)* %in + %cvt = fptrunc <8 x double> %val to <8 x half> + store <8 x half> %cvt, <8 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16: +; GCN: s_endpgm +define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { + %val = load <16 x double>, <16 x double> addrspace(1)* %in + %cvt = fptrunc <16 x double> %val to <16 x half> + store <16 x half> %cvt, <16 x half> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/unroll.ll b/test/CodeGen/R600/unroll.ll index ca8d822ec7ed..411a15a4b839 100644 --- a/test/CodeGen/R600/unroll.ll +++ b/test/CodeGen/R600/unroll.ll @@ -1,7 +1,6 @@ -; RUN: opt -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s +; RUN: opt -mtriple=amdgcn-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s +; RUN: opt -mtriple=r600-- -loop-unroll -simplifycfg -sroa %s -S -o - | FileCheck %s -target datalayout = 
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64" -target triple = "r600--" ; This test contains a simple loop that initializes an array declared in ; private memory. We want to make sure these kinds of loops are always diff --git a/test/CodeGen/R600/wrong-transalu-pos-fix.ll b/test/CodeGen/R600/wrong-transalu-pos-fix.ll index 5ab465338e15..8b383e4c393d 100644 --- a/test/CodeGen/R600/wrong-transalu-pos-fix.ll +++ b/test/CodeGen/R600/wrong-transalu-pos-fix.ll @@ -1,14 +1,9 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; RUN: llc -march=r600 -mcpu=redwood -mtriple=r600-- < %s | FileCheck %s ; We want all MULLO_INT inst to be last in their instruction group ;CHECK: {{^}}fill3d: ;CHECK-NOT: MULLO_INT T[0-9]+ -; ModuleID = 'radeon' -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64" -target triple = "r600--" - -; Function Attrs: nounwind define void @fill3d(i32 addrspace(1)* nocapture %out) #0 { entry: %x.i = tail call i32 @llvm.r600.read.global.size.x() #1 diff --git a/test/CodeGen/Thumb2/constant-islands-jump-table.ll b/test/CodeGen/Thumb2/constant-islands-jump-table.ll index 0dd7092291ba..5ffe1f9b09f6 100644 --- a/test/CodeGen/Thumb2/constant-islands-jump-table.ll +++ b/test/CodeGen/Thumb2/constant-islands-jump-table.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=thumbv7-linux-gnueabihf -O1 %s -o - | FileCheck %s ; CHECK-LABEL: test_jump_table: -; CHECK: b .LBB +; CHECK: b{{.*}} .LBB ; CHECK-NOT: tbh define i32 @test_jump_table(i32 %x, float %in) { diff --git a/test/CodeGen/Thumb2/float-ops.ll b/test/CodeGen/Thumb2/float-ops.ll index 7ec08f866655..4c42908ce13b 100644 --- a/test/CodeGen/Thumb2/float-ops.ll +++ b/test/CodeGen/Thumb2/float-ops.ll @@ -109,7 +109,7 @@ entry: define double @load_d(double* %a) { entry: ; CHECK-LABEL: load_d: -; NONE: ldm r0, {r0, r1} +; NONE: ldrd r0, r1, [r0] ; HARD: vldr d0, [r0] %0 = load double, double* %a, align 8 ret double %0 @@ -127,9 +127,7 @@ entry: define void @store_d(double* %a, double %b) { entry: ; CHECK-LABEL: store_d: -; NONE: mov r1, r3 -; NONE: str r2, [r0] -; NONE: str r1, [r0, #4] +; NONE: strd r2, r3, [r0] ; HARD: vstr d0, [r0] store double %b, double* %a, align 8 ret void diff --git a/test/CodeGen/Thumb2/thumb2-tbh.ll b/test/CodeGen/Thumb2/thumb2-tbh.ll index a5a5ed0c8da2..0761ed589a26 100644 --- a/test/CodeGen/Thumb2/thumb2-tbh.ll +++ b/test/CodeGen/Thumb2/thumb2-tbh.ll @@ -14,9 +14,19 @@ declare void @Z_fatal(i8*) noreturn nounwind declare noalias i8* @calloc(i32, i32) nounwind +; Jump tables are not anchored next to the TBB/TBH any more. Make sure the +; correct address is still calculated (i.e. via a PC-relative symbol *at* the +; TBB/TBH). 
define i32 @main(i32 %argc, i8** nocapture %argv) nounwind { ; CHECK-LABEL: main: -; CHECK: tbb +; CHECK-NOT: adr {{r[0-9]+}}, LJTI +; CHECK: [[PCREL_ANCHOR:LCPI[0-9]+_[0-9]+]]: +; CHECK-NEXT: tbb [pc, {{r[0-9]+}}] + +; CHECK: LJTI0_0: +; CHECK-NEXT: .data_region jt8 +; CHECK-NEXT: .byte (LBB{{[0-9]+_[0-9]+}}-([[PCREL_ANCHOR]]+4))/2 + entry: br label %bb42.i diff --git a/test/CodeGen/X86/asm-reject-reg-type-mismatch.ll b/test/CodeGen/X86/asm-reject-reg-type-mismatch.ll new file mode 100644 index 000000000000..016e2d261eef --- /dev/null +++ b/test/CodeGen/X86/asm-reject-reg-type-mismatch.ll @@ -0,0 +1,10 @@ +; RUN: not llc -no-integrated-as %s -o - 2> %t1 +; RUN: FileCheck %s < %t1 +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64--" + +; CHECK: error: couldn't allocate output register for constraint '{ax}' +define i128 @blup() { + %v = tail call i128 asm "", "={ax},0,~{dirflag},~{fpsr},~{flags}"(i128 0) + ret i128 %v +} diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll index 10ed079a264e..74d20f348b52 100644 --- a/test/CodeGen/X86/avx-vperm2x128.ll +++ b/test/CodeGen/X86/avx-vperm2x128.ll @@ -147,8 +147,8 @@ define <16 x i16> @E5i(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ; AVX1-LABEL: E5i: ; AVX1: ## BB#0: ## %entry ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vmovaps (%rsi), %ymm1 +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll index 8aae90c3c03d..5d99269ae1dc 100644 --- a/test/CodeGen/X86/avx2-vector-shifts.ll +++ b/test/CodeGen/X86/avx2-vector-shifts.ll @@ -300,6 +300,56 @@ define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind { ret <16 x i16> %shl } +define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind { +; CHECK-LABEL: shl_32i8 +; CHECK: vextracti128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpsllw $4, %xmm3, %xmm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; CHECK-NEXT: vpand %xmm8, %xmm2, %xmm5 +; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vpsllw $5, %xmm2, %xmm2 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm9 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; CHECK-NEXT: vpand %xmm9, %xmm2, %xmm7 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; CHECK-NEXT: vpand %xmm7, %xmm2, %xmm4 +; CHECK-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm4 +; CHECK-NEXT: vpblendvb %xmm4, %xmm5, %xmm3, %xmm3 +; CHECK-NEXT: vpsllw $2, %xmm3, %xmm4 +; CHECK-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; CHECK-NEXT: vpand %xmm5, %xmm4, %xmm4 +; CHECK-NEXT: vpaddb %xmm7, %xmm7, %xmm7 +; CHECK-NEXT: vpand %xmm7, %xmm2, %xmm6 +; CHECK-NEXT: vpcmpeqb %xmm2, %xmm6, %xmm6 +; CHECK-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vpaddb %xmm3, %xmm3, %xmm4 +; CHECK-NEXT: vpaddb %xmm7, %xmm7, %xmm6 +; CHECK-NEXT: vpand %xmm6, %xmm2, %xmm6 +; CHECK-NEXT: vpcmpeqb %xmm2, %xmm6, %xmm6 +; CHECK-NEXT: vpblendvb %xmm6, %xmm4, %xmm3, %xmm3 +; CHECK-NEXT: vpsllw $4, %xmm0, %xmm4 +; CHECK-NEXT: vpand %xmm8, %xmm4, %xmm4 +; CHECK-NEXT: vpsllw $5, %xmm1, %xmm1 +; CHECK-NEXT: vpand %xmm9, %xmm1, %xmm1 +; CHECK-NEXT: vpand %xmm1, %xmm2, %xmm6 +; CHECK-NEXT: vpcmpeqb %xmm2, %xmm6, %xmm6 +; CHECK-NEXT: vpblendvb %xmm6, %xmm4, 
%xmm0, %xmm0 +; CHECK-NEXT: vpsllw $2, %xmm0, %xmm4 +; CHECK-NEXT: vpand %xmm5, %xmm4, %xmm4 +; CHECK-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpand %xmm1, %xmm2, %xmm5 +; CHECK-NEXT: vpcmpeqb %xmm2, %xmm5, %xmm5 +; CHECK-NEXT: vpblendvb %xmm5, %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vpaddb %xmm0, %xmm0, %xmm4 +; CHECK-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpand %xmm1, %xmm2, %xmm1 +; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; CHECK-NEXT: retq + %shl = shl <32 x i8> %r, %a + ret <32 x i8> %shl +} + define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind { ; CHECK-LABEL: ashr_8i16 ; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero @@ -329,6 +379,176 @@ define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind { ret <16 x i16> %ashr } +define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind { +; CHECK-LABEL: ashr_32i8 +; CHECK: vextracti128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vpextrb $1, %xmm2, %ecx +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpextrb $1, %xmm3, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: vpextrb $0, %xmm2, %ecx +; CHECK-NEXT: vpextrb $0, %xmm3, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: movzbl %dl, %edx +; CHECK-NEXT: vpextrb $2, %xmm2, %ecx +; CHECK-NEXT: vpextrb $2, %xmm3, %esi +; CHECK-NEXT: sarb %cl, %sil +; CHECK-NEXT: vmovd %edx, %xmm4 +; CHECK-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: vpextrb $3, %xmm2, %ecx +; CHECK-NEXT: vpextrb $3, %xmm3, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; CHECK-NEXT: vpextrb $4, %xmm2, %ecx +; CHECK-NEXT: vpextrb $4, %xmm3, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; CHECK-NEXT: vpextrb $5, %xmm2, %ecx +; CHECK-NEXT: vpextrb $5, %xmm3, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: vpextrb $6, %xmm2, %ecx +; CHECK-NEXT: vpextrb $6, %xmm3, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpextrb $7, %xmm2, %ecx +; CHECK-NEXT: vpextrb $7, %xmm3, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; CHECK-NEXT: vpextrb $8, %xmm2, %ecx +; CHECK-NEXT: vpextrb $8, %xmm3, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; CHECK-NEXT: vpextrb $9, %xmm2, %ecx +; CHECK-NEXT: vpextrb $9, %xmm3, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: vpextrb $10, %xmm2, %ecx +; CHECK-NEXT: vpextrb $10, %xmm3, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpextrb $11, %xmm2, %ecx +; CHECK-NEXT: vpextrb $11, %xmm3, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; CHECK-NEXT: vpextrb $12, %xmm2, %ecx +; CHECK-NEXT: vpextrb $12, %xmm3, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; 
CHECK-NEXT: vpextrb $13, %xmm2, %ecx +; CHECK-NEXT: vpextrb $13, %xmm3, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: vpextrb $14, %xmm2, %ecx +; CHECK-NEXT: vpextrb $14, %xmm3, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; CHECK-NEXT: vpextrb $15, %xmm2, %ecx +; CHECK-NEXT: vpextrb $15, %xmm3, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: vpextrb $1, %xmm1, %ecx +; CHECK-NEXT: vpextrb $1, %xmm0, %esi +; CHECK-NEXT: sarb %cl, %sil +; CHECK-NEXT: movzbl %dl, %ecx +; CHECK-NEXT: vpinsrb $14, %ecx, %xmm4, %xmm2 +; CHECK-NEXT: vpextrb $0, %xmm1, %ecx +; CHECK-NEXT: vpextrb $0, %xmm0, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: vpextrb $2, %xmm1, %ecx +; CHECK-NEXT: vpextrb $2, %xmm0, %edi +; CHECK-NEXT: sarb %cl, %dil +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: movzbl %dl, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm3 +; CHECK-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: vpextrb $3, %xmm1, %ecx +; CHECK-NEXT: vpextrb $3, %xmm0, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; CHECK-NEXT: vpextrb $4, %xmm1, %ecx +; CHECK-NEXT: vpextrb $4, %xmm0, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; CHECK-NEXT: vpextrb $5, %xmm1, %ecx +; CHECK-NEXT: vpextrb $5, %xmm0, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: vpextrb $6, %xmm1, %ecx +; CHECK-NEXT: vpextrb $6, %xmm0, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpextrb $7, %xmm1, %ecx +; CHECK-NEXT: vpextrb $7, %xmm0, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; CHECK-NEXT: vpextrb $8, %xmm1, %ecx +; CHECK-NEXT: vpextrb $8, %xmm0, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; CHECK-NEXT: vpextrb $9, %xmm1, %ecx +; CHECK-NEXT: vpextrb $9, %xmm0, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: vpextrb $10, %xmm1, %ecx +; CHECK-NEXT: vpextrb $10, %xmm0, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpextrb $11, %xmm1, %ecx +; CHECK-NEXT: vpextrb $11, %xmm0, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; CHECK-NEXT: vpextrb $12, %xmm1, %ecx +; CHECK-NEXT: vpextrb $12, %xmm0, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; CHECK-NEXT: vpextrb $13, %xmm1, %ecx +; CHECK-NEXT: vpextrb $13, %xmm0, %eax +; CHECK-NEXT: sarb %cl, %al +; CHECK-NEXT: vpextrb $14, %xmm1, %ecx +; CHECK-NEXT: vpextrb $14, %xmm0, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpextrb $15, %xmm1, %ecx +; CHECK-NEXT: vpextrb $15, %xmm0, %edx +; CHECK-NEXT: sarb %cl, %dl +; CHECK-NEXT: vpinsrb $14, %eax, %xmm3, %xmm0 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; CHECK-NEXT: 
vinserti128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: retq + %ashr = ashr <32 x i8> %r, %a + ret <32 x i8> %ashr +} + define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind { ; CHECK-LABEL: lshr_8i16 ; CHECK: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero @@ -357,3 +577,173 @@ define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind { %lshr = lshr <16 x i16> %r, %a ret <16 x i16> %lshr } + +define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind { +; CHECK-LABEL: lshr_32i8 +; CHECK: vextracti128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vpextrb $1, %xmm2, %ecx +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 +; CHECK-NEXT: vpextrb $1, %xmm3, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: vpextrb $0, %xmm2, %ecx +; CHECK-NEXT: vpextrb $0, %xmm3, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: movzbl %dl, %edx +; CHECK-NEXT: vpextrb $2, %xmm2, %ecx +; CHECK-NEXT: vpextrb $2, %xmm3, %esi +; CHECK-NEXT: shrb %cl, %sil +; CHECK-NEXT: vmovd %edx, %xmm4 +; CHECK-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4 +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: vpextrb $3, %xmm2, %ecx +; CHECK-NEXT: vpextrb $3, %xmm3, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4 +; CHECK-NEXT: vpextrb $4, %xmm2, %ecx +; CHECK-NEXT: vpextrb $4, %xmm3, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4 +; CHECK-NEXT: vpextrb $5, %xmm2, %ecx +; CHECK-NEXT: vpextrb $5, %xmm3, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: vpextrb $6, %xmm2, %ecx +; CHECK-NEXT: vpextrb $6, %xmm3, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpextrb $7, %xmm2, %ecx +; CHECK-NEXT: vpextrb $7, %xmm3, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4 +; CHECK-NEXT: vpextrb $8, %xmm2, %ecx +; CHECK-NEXT: vpextrb $8, %xmm3, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4 +; CHECK-NEXT: vpextrb $9, %xmm2, %ecx +; CHECK-NEXT: vpextrb $9, %xmm3, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: vpextrb $10, %xmm2, %ecx +; CHECK-NEXT: vpextrb $10, %xmm3, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpextrb $11, %xmm2, %ecx +; CHECK-NEXT: vpextrb $11, %xmm3, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4 +; CHECK-NEXT: vpextrb $12, %xmm2, %ecx +; CHECK-NEXT: vpextrb $12, %xmm3, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4 +; CHECK-NEXT: vpextrb $13, %xmm2, %ecx +; CHECK-NEXT: vpextrb $13, %xmm3, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: vpextrb $14, %xmm2, %ecx +; CHECK-NEXT: vpextrb $14, %xmm3, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4 +; CHECK-NEXT: vpextrb $15, %xmm2, %ecx +; CHECK-NEXT: vpextrb $15, %xmm3, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: vpextrb $1, %xmm1, %ecx +; CHECK-NEXT: vpextrb $1, 
%xmm0, %esi +; CHECK-NEXT: shrb %cl, %sil +; CHECK-NEXT: movzbl %dl, %ecx +; CHECK-NEXT: vpinsrb $14, %ecx, %xmm4, %xmm2 +; CHECK-NEXT: vpextrb $0, %xmm1, %ecx +; CHECK-NEXT: vpextrb $0, %xmm0, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: vpextrb $2, %xmm1, %ecx +; CHECK-NEXT: vpextrb $2, %xmm0, %edi +; CHECK-NEXT: shrb %cl, %dil +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2 +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: movzbl %dl, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm3 +; CHECK-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: vpextrb $3, %xmm1, %ecx +; CHECK-NEXT: vpextrb $3, %xmm0, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3 +; CHECK-NEXT: vpextrb $4, %xmm1, %ecx +; CHECK-NEXT: vpextrb $4, %xmm0, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3 +; CHECK-NEXT: vpextrb $5, %xmm1, %ecx +; CHECK-NEXT: vpextrb $5, %xmm0, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: vpextrb $6, %xmm1, %ecx +; CHECK-NEXT: vpextrb $6, %xmm0, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpextrb $7, %xmm1, %ecx +; CHECK-NEXT: vpextrb $7, %xmm0, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3 +; CHECK-NEXT: vpextrb $8, %xmm1, %ecx +; CHECK-NEXT: vpextrb $8, %xmm0, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3 +; CHECK-NEXT: vpextrb $9, %xmm1, %ecx +; CHECK-NEXT: vpextrb $9, %xmm0, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: vpextrb $10, %xmm1, %ecx +; CHECK-NEXT: vpextrb $10, %xmm0, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpextrb $11, %xmm1, %ecx +; CHECK-NEXT: vpextrb $11, %xmm0, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3 +; CHECK-NEXT: vpextrb $12, %xmm1, %ecx +; CHECK-NEXT: vpextrb $12, %xmm0, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3 +; CHECK-NEXT: vpextrb $13, %xmm1, %ecx +; CHECK-NEXT: vpextrb $13, %xmm0, %eax +; CHECK-NEXT: shrb %cl, %al +; CHECK-NEXT: vpextrb $14, %xmm1, %ecx +; CHECK-NEXT: vpextrb $14, %xmm0, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpextrb $15, %xmm1, %ecx +; CHECK-NEXT: vpextrb $15, %xmm0, %edx +; CHECK-NEXT: shrb %cl, %dl +; CHECK-NEXT: vpinsrb $14, %eax, %xmm3, %xmm0 +; CHECK-NEXT: movzbl %dl, %eax +; CHECK-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: retq + %lshr = lshr <32 x i8> %r, %a + ret <32 x i8> %lshr +} diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll index 8373c6da2619..e70d9f3ad521 100644 --- a/test/CodeGen/X86/avx512-build-vector.ll +++ b/test/CodeGen/X86/avx512-build-vector.ll @@ -2,13 +2,9 @@ define <16 x i32> @test1(i32* %x) { ; CHECK-LABEL: test1: -; CHECK: ## BB#0: -; CHECK-NEXT: vmovd (%rdi), %xmm0 -; 
CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7] -; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: retq +; CHECK: vmovd (%rdi), %xmm +; CHECK: vmovdqa32 +; CHECK: vpermt2d %zmm %y = load i32, i32* %x, align 4 %res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4 ret <16 x i32>%res diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 471e34cdedce..9387192f8aa4 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -98,18 +98,55 @@ define <4 x float> @test_rcp14_ss(<4 x float> %a0) { declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) { + ; CHECK-LABEL: test_sqrt_pd_512 ; CHECK: vsqrtpd - %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) ; <<8 x double>> [#uses=1] + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) ret <8 x double> %res } -declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone +declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) { + ; CHECK-LABEL: test_sqrt_ps_512 ; CHECK: vsqrtps - %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) ; <<16 x float>> [#uses=1] + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) ret <16 x float> %res } -declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone +define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) { + ; CHECK-LABEL: test_sqrt_round_ps_512 + ; CHECK: vsqrtps {rz-sae} + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 3) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +define <8 x double> @test_getexp_pd_512(<8 x double> %a0) { + ; CHECK-LABEL: test_getexp_pd_512 + ; CHECK: vgetexppd + %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) + ret <8 x double> %res +} +define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) { + ; CHECK-LABEL: test_getexp_round_pd_512 + ; CHECK: vgetexppd {sae} + %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <16 x float> @test_getexp_ps_512(<16 x float> %a0) { + ; CHECK-LABEL: test_getexp_ps_512 + ; CHECK: vgetexpps + %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) { + ; CHECK-LABEL: test_getexp_round_ps_512 + ; CHECK: vgetexpps {sae} + %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) 
+ ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) { ; CHECK: vsqrtss {{.*}}encoding: [0x62 diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll new file mode 100644 index 000000000000..2683d6fe238c --- /dev/null +++ b/test/CodeGen/X86/avx512-shuffle.ll @@ -0,0 +1,336 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK-SKX + +; CHECK-LABEL: test1: +; CHECK: vpermps +; CHECK: ret +define <16 x float> @test1(<16 x float> %a) nounwind { + %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1> + ret <16 x float> %c +} + +; CHECK-LABEL: test2: +; CHECK: vpermd +; CHECK: ret +define <16 x i32> @test2(<16 x i32> %a) nounwind { + %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1> + ret <16 x i32> %c +} + +; CHECK-LABEL: test3: +; CHECK: vpermq +; CHECK: ret +define <8 x i64> @test3(<8 x i64> %a) nounwind { + %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 5, i32 1, i32 undef, i32 7, i32 undef, i32 3, i32 1> + ret <8 x i64> %c +} + +; CHECK-LABEL: test4: +; CHECK: vpermpd +; CHECK: ret +define <8 x double> @test4(<8 x double> %a) nounwind { + %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <8 x double> %c +} + +; CHECK-LABEL: test5: +; CHECK: vpermt2pd +; CHECK: ret +define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind { + %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5> + ret <8 x double> %c +} + +; CHECK-LABEL: test6: +; CHECK: vpermq $30 +; CHECK: ret +define <8 x i64> @test6(<8 x i64> %a) nounwind { + %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4> + ret <8 x i64> %c +} + +; CHECK-LABEL: test7: +; CHECK: vpermt2q +; CHECK: ret +define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind { + %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5> + ret <8 x i64> %c +} + +; CHECK-LABEL: test8: +; CHECK: vpermt2d +; CHECK: ret +define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind { + %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> + ret <16 x i32> %c +} + +; CHECK-LABEL: test9: +; CHECK: vpermt2ps +; CHECK: ret +define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind { + %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> + ret <16 x float> %c +} + +; CHECK-LABEL: test10: +; CHECK: vpermt2ps ( +; CHECK: ret +define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind { + %c = load <16 x float>, <16 x float>* %b + %d = shufflevector <16 x float> %a, <16 x float> %c, 
<16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> + ret <16 x float> %d +} + +; CHECK-LABEL: test11: +; CHECK: vpermt2d +; CHECK: ret +define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind { + %c = load <16 x i32>, <16 x i32>* %b + %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24> + ret <16 x i32> %d +} + +; CHECK-LABEL: test13 +; CHECK: vpermilps $177, %zmm +; CHECK: ret +define <16 x float> @test13(<16 x float> %a) { + %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> + ret <16 x float> %b +} + +; CHECK-LABEL: test14 +; CHECK: vpermilpd $203, %zmm +; CHECK: ret +define <8 x double> @test14(<8 x double> %a) { + %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7> + ret <8 x double> %b +} + +; CHECK-LABEL: test15 +; CHECK: vpshufd $177, %zmm +; CHECK: ret +define <16 x i32> @test15(<16 x i32> %a) { +; mask 1-0-3-2 = 10110001 = 0xb1 = 177 + %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14> + ret <16 x i32> %b +} +; CHECK-LABEL: test16 +; CHECK: valignq $2, %zmm0, %zmm1 +; CHECK: ret +define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind { + %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> + ret <8 x double> %c +} + +; CHECK-LABEL: test17 +; CHECK: vshufpd $19, %zmm1, %zmm0 +; CHECK: ret +define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind { + %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef> + ret <8 x double> %c +} + +; CHECK-LABEL: test18 +; CHECK: vpunpckhdq %zmm +; CHECK: ret +define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) { + %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> + ret <16 x i32> %b +} + +; CHECK-LABEL: test19 +; CHECK: vpunpckldq %zmm +; CHECK: ret +define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) { + %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> + ret <16 x i32> %b +} + +; CHECK-LABEL: test20 +; CHECK: vpunpckhqdq %zmm +; CHECK: ret +define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) { + %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> + ret <8 x i64> %b +} + +; CHECK-LABEL: test21 +; CHECK: vbroadcastsd %xmm0, %zmm +; CHECK: ret +define <8 x double> @test21(<8 x double> %a, <8 x double> %b) { + %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> + ret <8 x double> %shuffle +} + +; CHECK-LABEL: test22 +; CHECK: vpbroadcastq %xmm0, %zmm +; CHECK: ret +define <8 x i64> @test22(<8 x i64> %a, <8 x i64> %b) { + %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, 
i32 0, i32 0, i32 0> + ret <8 x i64> %shuffle +} + +; CHECK-LABEL: @test23 +; CHECK: vshufps +; CHECK: vshufps +; CHECK: ret +define <16 x i32> @test23(<16 x i32> %a, <16 x i32> %b) nounwind { + %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i32> %c +} + +; CHECK-LABEL: @test24 +; CHECK: vpermt2d +; CHECK: ret +define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind { + %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 25, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i32> %c +} + +; CHECK-LABEL: @test25 +; CHECK: vshufps $52 +; CHECK: ret +define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind { +; mask - 0-1-3-0 00110100 = 0x34 = 52 + %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 16, i32 4, i32 5, i32 23, i32 undef, i32 8, i32 9, i32 27, i32 undef, i32 12, i32 13, i32 undef, i32 undef> + ret <16 x i32> %c +} + +; CHECK-LABEL: @test26 +; CHECK: vmovshdup +; CHECK: ret +define <16 x i32> @test26(<16 x i32> %a) nounwind { + %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef> + ret <16 x i32> %c +} + +; CHECK-LABEL: @test27 +; CHECK: ret +define <16 x i32> @test27(<4 x i32>%a) { + %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + ret <16 x i32> %res +} + +; CHECK-LABEL: test28 +; CHECK: vpshufhw $177, %ymm +; CHECK: ret +define <16 x i16> @test28(<16 x i16> %a) { + %b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32><i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 12, i32 15, i32 14> + ret <16 x i16> %b +} + +; CHECK-LABEL: test29 +; CHECK: vunpcklps %zmm +; CHECK: ret +define <16 x float> @test29(<16 x float> %a, <16 x float> %c) { + %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> + ret <16 x float> %b +} + +; CHECK-LABEL: @test30 +; CHECK: vshufps $144, %zmm +; CHECK: ret +define <16 x float> @test30(<16 x float> %a, <16 x float> %c) { + %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30> + ret <16 x float> %b +} + +; CHECK-LABEL: test31 +; CHECK: valignd $3, %zmm0, %zmm1 +; CHECK: ret +define <16 x i32> @test31(<16 x i32> %a, <16 x i32> %b) nounwind { + %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18> + ret <16 x i32> %c +} + +; CHECK-LABEL: test32 +; CHECK: vshufpd $99, %zmm0, %zmm1 +; CHECK: ret +define <8 x double> @test32(<8 x double> %a, <8 x double> %b) nounwind { + %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 1, i32 10, i32 2, i32 undef, i32 5, i32 15, i32 undef> + ret <8 x double> %c +} + +define 
<16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind { +; CHECK-LABEL: test_align_v16i32_rr: +; CHECK: ## BB#0: +; CHECK-NEXT: valignd $3, %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18> + ret <16 x i32> %c +} + +define <16 x i32> @test_align_v16i32_rm(<16 x i32>* %a.ptr, <16 x i32> %b) nounwind { +; CHECK-LABEL: test_align_v16i32_rm: +; CHECK: ## BB#0: +; CHECK-NEXT: valignd $3, (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = load <16 x i32>, <16 x i32>* %a.ptr + %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18> + ret <16 x i32> %c +} + +define <16 x i32> @test_align_v16i32_rm_mask(<16 x i32>* %a.ptr, <16 x i32> %b, <16 x i1> %mask) nounwind { +; CHECK-LABEL: test_align_v16i32_rm_mask: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1 +; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1 +; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 +; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1 +; CHECK-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: retq +; +; CHECK-SKX-LABEL: test_align_v16i32_rm_mask: +; CHECK-SKX: ## BB#0: +; CHECK-SKX-NEXT: vpmovb2m %xmm1, %k1 +; CHECK-SKX-NEXT: vmovdqa32 (%rdi), %zmm1 +; CHECK-SKX-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1} +; CHECK-SKX-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-SKX-NEXT: retq + %a = load <16 x i32>, <16 x i32>* %a.ptr + %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18> + %res = select <16 x i1> %mask,<16 x i32> %c, <16 x i32> %a + ret <16 x i32> %res +} + +define <8 x double> @test_align_v8f64_rr(<8 x double> %a, <8 x double> %b) nounwind { +; CHECK-LABEL: test_align_v8f64_rr: +; CHECK: ## BB#0: +; CHECK-NEXT: valignq $3, %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> + ret <8 x double> %c +} + +define <8 x double> @test_align_v18f64_rm(<8 x double>* %a.ptr, <8 x double> %b) nounwind { +; CHECK-LABEL: test_align_v18f64_rm: +; CHECK: ## BB#0: +; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = load <8 x double>, <8 x double>* %a.ptr + %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> + ret <8 x double> %c +} + +define <8 x double> @test_align_v18f64_rm_mask(<8 x double>* %a.ptr, <8 x double> %b, <8 x i1> %mask) nounwind { +; CHECK-LABEL: test_align_v18f64_rm_mask: +; CHECK: ## BB#0: +; CHECK-NEXT: vpmovsxwq %xmm1, %zmm1 +; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1 +; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-NEXT: retq +; +; CHECK-SKX-LABEL: test_align_v18f64_rm_mask: +; CHECK-SKX: ## BB#0: +; CHECK-SKX-NEXT: vpmovw2m %xmm1, %k1 +; CHECK-SKX-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z} +; CHECK-SKX-NEXT: retq + %a = load <8 x double>, <8 x double>* %a.ptr + %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> + %res = select <8 x i1> %mask,<8 x double> %c, <8 x double> zeroinitializer + ret <8 x 
double> %res +} + diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll index e1f6276c6ef4..04028a1da510 100644 --- a/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/test/CodeGen/X86/avx512-vec-cmp.ll @@ -116,11 +116,8 @@ define <2 x double> @test8(<2 x double> %a, <2 x double> %b) { define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind { ; KNL-LABEL: test9: ; KNL: ## BB#0: -; KNL-NEXT: ## kill: YMM1<def> YMM1<kill> ZMM1<def> -; KNL-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<def> ; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 ; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} -; KNL-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<kill> ; KNL-NEXT: retq %mask = icmp eq <8 x i32> %x, %y %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y @@ -130,11 +127,8 @@ define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind { define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind { ; KNL-LABEL: test10: ; KNL: ## BB#0: -; KNL-NEXT: ## kill: YMM1<def> YMM1<kill> ZMM1<def> -; KNL-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<def> ; KNL-NEXT: vcmpeqps %zmm1, %zmm0, %k1 ; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} -; KNL-NEXT: ## kill: YMM0<def> YMM0<kill> ZMM0<kill> ; KNL-NEXT: retq ; SKX-LABEL: test10: ; SKX: ## BB#0: @@ -166,7 +160,6 @@ define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind { ; KNL-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 ; KNL-NEXT: kunpckbw %k0, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: ## kill: AX<def> AX<kill> EAX<kill> ; KNL-NEXT: retq %res = icmp eq <16 x i64> %a, %b %res1 = bitcast <16 x i1> %res to i16 diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index fd76ed5d0dbd..9d96c272f355 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -668,7 +668,7 @@ declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 ; CHECK-LABEL: compr7 ; CHECK-NOT: vcompress -; CHECK: vmovapd +; CHECK: vmovupd define void @compr7(i8* %addr, <8 x double> %data) { call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1) ret void @@ -757,7 +757,7 @@ declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x ; CHECK-LABEL: expand7 ; CHECK-NOT: vexpand -; CHECK: vmovapd +; CHECK: vmovupd define <8 x double> @expand7(i8* %addr, <8 x double> %data) { %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1) ret <8 x double> %res @@ -2552,4 +2552,38 @@ define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 % %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1) ret <4 x float> %res } -declare <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
\ No newline at end of file +declare <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) + +define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) { + ; CHECK-LABEL: test_sqrt_pd_256 + ; CHECK: vsqrtpd + %res = call <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask) + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone + +define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) { + ; CHECK-LABEL: test_sqrt_ps_256 + ; CHECK: vsqrtps + %res = call <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone + +define <4 x double> @test_getexp_pd_256(<4 x double> %a0) { + ; CHECK-LABEL: test_getexp_pd_256 + ; CHECK: vgetexppd + %res = call <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone + +define <8 x float> @test_getexp_ps_256(<8 x float> %a0) { + ; CHECK-LABEL: test_getexp_ps_256 + ; CHECK: vgetexpps + %res = call <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
\ No newline at end of file diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll index 3fb69a48b3c7..73dbe1f650a1 100644 --- a/test/CodeGen/X86/buildvec-insertvec.ll +++ b/test/CodeGen/X86/buildvec-insertvec.ll @@ -1,15 +1,56 @@ -; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind { +; CHECK-LABEL: foo: +; CHECK: # BB#0: +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: movl $255, %eax +; CHECK-NEXT: pinsrd $3, %eax, %xmm0 +; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-NEXT: movd %xmm0, (%rdi) +; CHECK-NEXT: retq %t0 = fptoui <3 x float> %in to <3 x i8> %t1 = shufflevector <3 x i8> %t0, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> %t2 = insertelement <4 x i8> %t1, i8 -1, i32 3 store <4 x i8> %t2, <4 x i8>* %out, align 4 ret void -; CHECK: foo -; CHECK: cvttps2dq -; CHECK-NOT: pextrd -; CHECK: pinsrd -; CHECK-NEXT: pshufb -; CHECK: ret +} + +; Verify that the DAGCombiner doesn't wrongly fold a build_vector into a +; blend with a zero vector if the build_vector contains negative zero. +; +; TODO: the codegen for function 'test_negative_zero_1' is sub-optimal. +; Ideally, we should generate a single shuffle blend operation. + +define <4 x float> @test_negative_zero_1(<4 x float> %A) { +; CHECK-LABEL: test_negative_zero_1: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movapd %xmm0, %xmm1 +; CHECK-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0] +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: blendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: retq +entry: + %0 = extractelement <4 x float> %A, i32 0 + %1 = insertelement <4 x float> undef, float %0, i32 0 + %2 = insertelement <4 x float> %1, float -0.0, i32 1 + %3 = extractelement <4 x float> %A, i32 2 + %4 = insertelement <4 x float> %2, float %3, i32 2 + %5 = insertelement <4 x float> %4, float 0.0, i32 3 + ret <4 x float> %5 +} + +define <2 x double> @test_negative_zero_2(<2 x double> %A) { +; CHECK-LABEL: test_negative_zero_2: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movhpd {{.*}}(%rip), %xmm0 +; CHECK-NEXT: retq +entry: + %0 = extractelement <2 x double> %A, i32 0 + %1 = insertelement <2 x double> undef, double %0, i32 0 + %2 = insertelement <2 x double> %1, double -0.0, i32 1 + ret <2 x double> %2 } diff --git a/test/CodeGen/X86/critical-anti-dep-breaker.ll b/test/CodeGen/X86/critical-anti-dep-breaker.ll index 86afc1f245ad..de5744d3a88f 100644 --- a/test/CodeGen/X86/critical-anti-dep-breaker.ll +++ b/test/CodeGen/X86/critical-anti-dep-breaker.ll @@ -11,8 +11,7 @@ @NullToken = external global i64 ; CHECK-LABEL: Part_Create: -; CHECK-DAG: # kill: RDI<def> -; CHECK-DAG: movq PartClass@GOTPCREL(%rip), %r10 +; CHECK: movq PartClass@GOTPCREL(%rip), %r10 define i32 @Part_Create(i64* %Anchor, i32 %TypeNum, i32 %F, i32 %Z, i32* %Status, i64* %PartTkn) { %PartObj = alloca i64*, align 8 %Vchunk = alloca i64, align 8 diff --git a/test/CodeGen/X86/machine-cp.ll b/test/CodeGen/X86/machine-cp.ll index 0006b6ea7133..aaed0f0a23dc 100644 --- a/test/CodeGen/X86/machine-cp.ll +++ b/test/CodeGen/X86/machine-cp.ll @@ -58,3 +58,58 @@ while.end: ; preds = %while.body, %entry %t = trunc i64 %a.addr.0.lcssa to i32 ret i32 
%t } + +; Check that copy propagation does not kill things like: +; dst = copy src <-- do not kill that. +; ... = op1 dst<undef> +; ... = op2 dst <-- this is used here. +; +; CHECK-LABEL: foo: +; CHECK: psllw $7, +; CHECK: psllw $7, +; CHECK-NEXT: pand +; CHECK-NEXT: pcmpgtb +; CHECK-NEXT: pand %xmm{{[0-9]+}}, [[SRC:%xmm[0-9]+]] +; Machine copy propagation used to delete the first copy as the +; first few uses were <undef>. +; CHECK-NEXT: movdqa [[SRC]], [[CPY1:%xmm[0-9]+]] +; CHECK-NEXT: movdqa [[SRC]], [[CPY2:%xmm[0-9]+]] +; CHECK-NEXT: punpckhbw [[SRC]], +; Check that CPY1 is not redefined. +; CHECK-NOT: , [[CPY1]] +; undef use, we do not care. +; CHECK: punpcklwd [[CPY1]], +; Check that CPY1 is not redefined. +; CHECK-NOT: , [[CPY1]] +; CHECK: punpcklbw [[CPY2]], [[CPY2]] +; CHECK-NEXT: punpckhwd [[CPY2]], [[CPY2]] +; CHECK-NEXT: pslld $31, [[CPY2]] +; Check that CPY1 is not redefined. +; CHECK-NOT: , [[CPY1]] +; CHECK: punpcklbw [[CPY1]], [[CPY1]] +; CHECK-NEXT: punpcklwd [[CPY1]], [[CPY1]] +; CHECK-NEXT: pslld $31, [[CPY1]] +define <16 x float> @foo(<16 x float> %x) { +bb: + %v3 = icmp slt <16 x i32> undef, zeroinitializer + %v14 = zext <16 x i1> %v3 to <16 x i32> + %v16 = fcmp olt <16 x float> %x, zeroinitializer + %v17 = sext <16 x i1> %v16 to <16 x i32> + %v18 = zext <16 x i1> %v16 to <16 x i32> + %v19 = xor <16 x i32> %v14, %v18 + %v20 = or <16 x i32> %v17, undef + %v21 = fptosi <16 x float> %x to <16 x i32> + %v22 = sitofp <16 x i32> %v21 to <16 x float> + %v69 = fcmp ogt <16 x float> %v22, zeroinitializer + %v75 = and <16 x i1> %v69, %v3 + %v77 = bitcast <16 x float> %v22 to <16 x i32> + %v79 = sext <16 x i1> %v75 to <16 x i32> + %v80 = and <16 x i32> undef, %v79 + %v81 = xor <16 x i32> %v77, %v80 + %v82 = and <16 x i32> undef, %v81 + %v83 = xor <16 x i32> %v19, %v82 + %v84 = and <16 x i32> %v83, %v20 + %v85 = xor <16 x i32> %v19, %v84 + %v86 = bitcast <16 x i32> %v85 to <16 x float> + ret <16 x float> %v86 +} diff --git a/test/CodeGen/X86/pic.ll b/test/CodeGen/X86/pic.ll index d543deb804d1..73be234db81c 100644 --- a/test/CodeGen/X86/pic.ll +++ b/test/CodeGen/X86/pic.ll @@ -196,9 +196,11 @@ bb12: ; LINUX-NEXT: .LJTI7_0: ; LINUX: .long .LBB7_2@GOTOFF ; LINUX: .long .LBB7_8@GOTOFF -; LINUX: .long .LBB7_14@GOTOFF -; LINUX: .long .LBB7_9@GOTOFF -; LINUX: .long .LBB7_10@GOTOFF +; LINUX: .long .LBB7_4@GOTOFF +; LINUX: .long .LBB7_6@GOTOFF +; LINUX: .long .LBB7_5@GOTOFF +; LINUX: .long .LBB7_8@GOTOFF +; LINUX: .long .LBB7_7@GOTOFF } declare void @foo1(...) 
diff --git a/test/CodeGen/X86/pr23603.ll b/test/CodeGen/X86/pr23603.ll new file mode 100644 index 000000000000..6f856aedb8d5 --- /dev/null +++ b/test/CodeGen/X86/pr23603.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s + +declare void @free_v() + +define void @f(i32* %x, i32 %c32, i32* %y) { +; CHECK-LABEL: f + entry: + %v = load i32, i32* %x, !invariant.load !0 +; CHECK: movl (%rdi), %ebx +; CHECK: free_v +; CHECK-NOT: movl (%rdi), %ebx + call void @free_v() + %c = icmp ne i32 %c32, 0 + br i1 %c, label %left, label %merge + + left: + store i32 %v, i32* %y + br label %merge + + merge: + ret void +} + +!0 = !{} diff --git a/test/CodeGen/X86/pr23664.ll b/test/CodeGen/X86/pr23664.ll new file mode 100644 index 000000000000..a501c0db837e --- /dev/null +++ b/test/CodeGen/X86/pr23664.ll @@ -0,0 +1,14 @@ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i2 @f(i32 %arg) { + %trunc = trunc i32 %arg to i1 + %sext = sext i1 %trunc to i2 + %or = or i2 %sext, 1 + ret i2 %or + +; CHECK-LABEL: f: +; CHECK: addb %dil, %dil +; CHECK-NEXT: orb $1, %dil +; CHECK-NEXT: movb %dil, %al +; CHECK-NEXT: retq +} diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll index fcd077092dab..7f1521a83bcf 100644 --- a/test/CodeGen/X86/recip-fastmath.ll +++ b/test/CodeGen/X86/recip-fastmath.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est | FileCheck %s --check-prefix=RECIP -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-recip-est -x86-recip-refinement-steps=2 | FileCheck %s --check-prefix=REFINE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf,vec-divf | FileCheck %s --check-prefix=RECIP +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=divf:2,vec-divf:2 | FileCheck %s --check-prefix=REFINE ; If the target's divss/divps instructions are substantially ; slower than rcpss/rcpps with a Newton-Raphson refinement, diff --git a/test/CodeGen/X86/sibcall-win64.ll b/test/CodeGen/X86/sibcall-win64.ll index f7038726f9ca..204e1f8b050b 100644 --- a/test/CodeGen/X86/sibcall-win64.ll +++ b/test/CodeGen/X86/sibcall-win64.ll @@ -1,7 +1,11 @@ ; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s declare x86_64_win64cc void @win64_callee(i32) +declare x86_64_win64cc void (i32)* @win64_indirect() +declare x86_64_win64cc void @win64_other(i32) declare void @sysv_callee(i32) +declare void (i32)* @sysv_indirect() +declare void @sysv_other(i32) define void @sysv_caller(i32 %p1) { entry: @@ -40,3 +44,23 @@ define x86_64_win64cc void @win64_matched(i32 %p1) { ; CHECK-LABEL: win64_matched: ; CHECK: jmp win64_callee # TAILCALL + +define x86_64_win64cc void @win64_indirect_caller(i32 %p1) { + %1 = call x86_64_win64cc void (i32)* @win64_indirect() + call x86_64_win64cc void @win64_other(i32 0) + tail call x86_64_win64cc void %1(i32 %p1) + ret void +} + +; CHECK-LABEL: win64_indirect_caller: +; CHECK: jmpq *%{{rax|rcx|rdx|r8|r9|r11}} # TAILCALL + +define void @sysv_indirect_caller(i32 %p1) { + %1 = call void (i32)* @sysv_indirect() + call void @sysv_other(i32 0) + tail call void %1(i32 %p1) + ret void +} + +; CHECK-LABEL: sysv_indirect_caller: +; CHECK: jmpq *%{{rax|rcx|rdx|rsi|rdi|r8|r9|r11}} # TAILCALL diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll index 4c6b521156e0..373fa53c970f 100644 --- a/test/CodeGen/X86/sqrt-fastmath.ll +++ 
b/test/CodeGen/X86/sqrt-fastmath.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,use-sqrt-est | FileCheck %s --check-prefix=ESTIMATE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx -recip=sqrtf,vec-sqrtf | FileCheck %s --check-prefix=ESTIMATE declare double @__sqrt_finite(double) #0 declare float @__sqrtf_finite(float) #0 diff --git a/test/CodeGen/X86/stack-folding-x86_64.ll b/test/CodeGen/X86/stack-folding-x86_64.ll new file mode 100644 index 000000000000..211227916a09 --- /dev/null +++ b/test/CodeGen/X86/stack-folding-x86_64.ll @@ -0,0 +1,51 @@ +; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-unknown" + +; Stack reload folding tests. +; +; By including a nop call with sideeffects we can force a partial register spill of the +; relevant registers and check that the reload is correctly folded into the instruction. + +;TODO stack_fold_bsf_i16 +declare i16 @llvm.cttz.i16(i16, i1) + +define i32 @stack_fold_bsf_i32(i32 %a0) { + ;CHECK-LABEL: stack_fold_bsf_i32 + ;CHECK: bsfl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload + %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + %2 = call i32 @llvm.cttz.i32(i32 %a0, i1 -1) + ret i32 %2 +} +declare i32 @llvm.cttz.i32(i32, i1) + +define i64 @stack_fold_bsf_i64(i64 %a0) { + ;CHECK-LABEL: stack_fold_bsf_i64 + ;CHECK: bsfq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload + %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + %2 = call i64 @llvm.cttz.i64(i64 %a0, i1 -1) + ret i64 %2 +} +declare i64 @llvm.cttz.i64(i64, i1) + +;TODO stack_fold_bsr_i16 +declare i16 @llvm.ctlz.i16(i16, i1) + +define i32 @stack_fold_bsr_i32(i32 %a0) { + ;CHECK-LABEL: stack_fold_bsr_i32 + ;CHECK: bsrl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload + %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + %2 = call i32 @llvm.ctlz.i32(i32 %a0, i1 -1) + ret i32 %2 +} +declare i32 @llvm.ctlz.i32(i32, i1) + +define i64 @stack_fold_bsr_i64(i64 %a0) { + ;CHECK-LABEL: stack_fold_bsr_i64 + ;CHECK: bsrq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload + %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() + %2 = call i64 @llvm.ctlz.i64(i64 %a0, i1 -1) + ret i64 %2 +} +declare i64 @llvm.ctlz.i64(i64, i1) diff --git a/test/CodeGen/X86/statepoint-far-call.ll b/test/CodeGen/X86/statepoint-far-call.ll new file mode 100644 index 000000000000..cd8dd0f35a20 --- /dev/null +++ b/test/CodeGen/X86/statepoint-far-call.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s | FileCheck %s
+; Test to check that Statepoints with X64 far-immediate targets
+; are lowered correctly to an indirect call via a scratch register.
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-win64"
+
+define void @test_far_call() gc "statepoint-example" {
+; CHECK-LABEL: test_far_call
+; CHECK: pushq %rax
+; CHECK: movabsq $140727162896504, %rax
+; CHECK: callq *%rax
+; CHECK: popq %rax
+; CHECK: retq
+
+entry:
+ %safepoint_token = call i32 (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* inttoptr (i64 140727162896504 to void ()*), i32 0, i32 0, i32 0, i32 0)
+ ret void
+}
+
+declare i32 @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)
+
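A side note on why the statepoint test above expects an indirect call (hand-written illustration, not taken from this commit): a direct x86-64 call encodes only a signed 32-bit PC-relative displacement, and 140727162896504 is on the order of 2^47, far beyond that +/-2 GiB reach, so the target must be materialized into a scratch register first. The same lowering applies to an ordinary call to a far absolute address; a minimal sketch follows, where the function name and CHECK lines are illustrative assumptions and the exact scratch register llc picks is not guaranteed.
; RUN: llc < %s -mtriple=x86_64-pc-win64 | FileCheck %s
; Hypothetical companion example (not part of this commit): an ordinary call
; to an absolute address that does not fit in a rel32 displacement is expected
; to be lowered to a movabsq of the immediate plus an indirect call through a
; register, mirroring the statepoint case above.
; CHECK-LABEL: test_far_plain_call
; CHECK: movabsq $140727162896504, %r
; CHECK: callq *%r
define void @test_far_plain_call() {
entry:
  call void inttoptr (i64 140727162896504 to void ()*)()
  ret void
}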
diff --git a/test/CodeGen/X86/switch-or.ll b/test/CodeGen/X86/switch-or.ll index 6e6b013d9fa8..4642accfff8d 100644 --- a/test/CodeGen/X86/switch-or.ll +++ b/test/CodeGen/X86/switch-or.ll @@ -1,10 +1,11 @@ ; RUN: llc -march=x86 -asm-verbose=false < %s | FileCheck %s ; Check that merging switch cases that differ in one bit works. +; CHECK-LABEL: test1 ; CHECK: orl $2 ; CHECK-NEXT: cmpl $6 -define void @foo(i32 %variable) nounwind { +define void @test1(i32 %variable) nounwind { entry: switch i32 %variable, label %if.end [ i32 4, label %if.then @@ -19,4 +20,22 @@ if.end: ret void } +; CHECK-LABEL: test2 +; CHECK: orl $-2147483648 +; CHECK-NEXT: cmpl $-2147483648 +define void @test2(i32 %variable) nounwind { +entry: + switch i32 %variable, label %if.end [ + i32 0, label %if.then + i32 -2147483648, label %if.then + ] + +if.then: + %call = tail call i32 (...) @bar() nounwind + ret void + +if.end: + ret void +} + declare i32 @bar(...) nounwind diff --git a/test/CodeGen/X86/switch.ll b/test/CodeGen/X86/switch.ll index 66a739c8470c..a4dece65479c 100644 --- a/test/CodeGen/X86/switch.ll +++ b/test/CodeGen/X86/switch.ll @@ -534,3 +534,18 @@ return: ret void ; CHECK-NOT: cmpl ; CHECK: cmpl $99 } + + +define void @pr23738(i4 %x) { +entry: + switch i4 %x, label %bb0 [ + i4 0, label %bb1 + i4 1, label %bb1 + i4 -5, label %bb1 + ] +bb0: tail call void @g(i32 0) br label %return +bb1: tail call void @g(i32 1) br label %return +return: ret void +; Don't assert due to truncating the bitwidth (64) to i4 when checking +; that the bit-test range fits in a word. +} diff --git a/test/CodeGen/X86/tail-call-got.ll b/test/CodeGen/X86/tail-call-got.ll index 84d561dcd8c3..20d1a87b626a 100644 --- a/test/CodeGen/X86/tail-call-got.ll +++ b/test/CodeGen/X86/tail-call-got.ll @@ -1,12 +1,14 @@ ; RUN: llc < %s -relocation-model=pic -mattr=+sse2 | FileCheck %s +; We used to do tail calls through the GOT for these symbols, but it was +; disabled due to PR15086. + target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32" target triple = "i386-unknown-freebsd9.0" define double @test1(double %x) nounwind readnone { ; CHECK-LABEL: test1: -; CHECK: movl foo@GOT -; CHECK-NEXT: jmpl +; CHECK: calll foo@PLT %1 = tail call double @foo(double %x) nounwind readnone ret double %1 } @@ -15,10 +17,18 @@ declare double @foo(double) readnone define double @test2(double %x) nounwind readnone { ; CHECK-LABEL: test2: -; CHECK: movl sin@GOT -; CHECK-NEXT: jmpl +; CHECK: calll sin@PLT %1 = tail call double @sin(double %x) nounwind readnone ret double %1 } declare double @sin(double) readnone + +define double @test3(double %x) nounwind readnone { +; CHECK-LABEL: test3: +; CHECK: calll sin2@PLT + %1 = tail call double @sin2(double %x) nounwind readnone + ret double %1 +} + +declare double @sin2(double) readnone diff --git a/test/CodeGen/X86/tailcallpic1.ll b/test/CodeGen/X86/tailcallpic1.ll index ff590a1fd3e9..ed101fcccd2d 100644 --- a/test/CodeGen/X86/tailcallpic1.ll +++ b/test/CodeGen/X86/tailcallpic1.ll @@ -1,5 +1,8 @@ ; RUN: llc < %s -tailcallopt -mtriple=i686-pc-linux-gnu -relocation-model=pic | FileCheck %s +; This test uses guaranteed TCO so these will be tail calls, despite the early +; binding issues. 
+ define protected fastcc i32 @tailcallee(i32 %a1, i32 %a2, i32 %a3, i32 %a4) { entry: ret i32 %a3 diff --git a/test/CodeGen/X86/tailcallpic3.ll b/test/CodeGen/X86/tailcallpic3.ll new file mode 100644 index 000000000000..edc58052d82f --- /dev/null +++ b/test/CodeGen/X86/tailcallpic3.ll @@ -0,0 +1,73 @@ +; RUN: llc < %s -mtriple=i686-pc-linux-gnu -relocation-model=pic | FileCheck %s + +; While many of these could be tail called, we don't do it because it forces +; early binding. + +declare void @external() + +define hidden void @tailcallee_hidden() { +entry: + ret void +} + +define void @tailcall_hidden() { +entry: + tail call void @tailcallee_hidden() + ret void +} +; CHECK: tailcall_hidden: +; CHECK: jmp tailcallee_hidden + +define internal void @tailcallee_internal() { +entry: + ret void +} + +define void @tailcall_internal() { +entry: + tail call void @tailcallee_internal() + ret void +} +; CHECK: tailcall_internal: +; CHECK: jmp tailcallee_internal + +define default void @tailcallee_default() { +entry: + ret void +} + +define void @tailcall_default() { +entry: + tail call void @tailcallee_default() + ret void +} +; CHECK: tailcall_default: +; CHECK: calll tailcallee_default@PLT + +define void @tailcallee_default_implicit() { +entry: + ret void +} + +define void @tailcall_default_implicit() { +entry: + tail call void @tailcallee_default_implicit() + ret void +} +; CHECK: tailcall_default_implicit: +; CHECK: calll tailcallee_default_implicit@PLT + +define void @tailcall_external() { + tail call void @external() + ret void +} +; CHECK: tailcall_external: +; CHECK: calll external@PLT + +define void @musttail_external() { + musttail call void @external() + ret void +} +; CHECK: musttail_external: +; CHECK: movl external@GOT +; CHECK: jmpl diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll index 9f1c7afa295b..3e72212d85d3 100644 --- a/test/CodeGen/X86/vec_fp_to_int.ll +++ b/test/CodeGen/X86/vec_fp_to_int.ll @@ -239,7 +239,6 @@ define <4 x i64> @fptoui_4vf64(<4 x double> %a) { ; SSE2: # BB#0: ; SSE2-NEXT: movapd %xmm0, %xmm2 ; SSE2-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE2-NEXT: {{.*#+}} kill: XMM0<def> XMM2<kill> ; SSE2-NEXT: subsd %xmm3, %xmm0 ; SSE2-NEXT: cvttsd2si %xmm0, %rcx ; SSE2-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 @@ -589,7 +588,6 @@ define <8 x i32> @fptoui_8vf32(<8 x float> %a) { ; SSE2-LABEL: fptoui_8vf32: ; SSE2: # BB#0: ; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: {{.*#+}} kill: XMM0<def> XMM2<kill> ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE2-NEXT: cvttss2si %xmm0, %rax ; SSE2-NEXT: movd %eax, %xmm0 diff --git a/test/CodeGen/X86/vec_shift8.ll b/test/CodeGen/X86/vec_shift8.ll new file mode 100644 index 000000000000..a32cb30b0b26 --- /dev/null +++ b/test/CodeGen/X86/vec_shift8.ll @@ -0,0 +1,1016 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX + +; +; Vectorized integer shifts +; + +define <2 x i64> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind readnone ssp { +entry: +; SSE2: pextrw $7, %xmm0, %eax +; SSE2-NEXT: pextrw $7, %xmm1, %ecx +; SSE2-NEXT: shll %cl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: pextrw $3, %xmm1, %ecx +; SSE2-NEXT: shll %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklwd 
{{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pextrw $5, %xmm0, %eax +; SSE2-NEXT: pextrw $5, %xmm1, %ecx +; SSE2-NEXT: shll %cl, %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: shll %cl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: shll %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: shll %cl, %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: shll %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: shll %cl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSE41: pextrw $1, %xmm0, %eax +; SSE41-NEXT: pextrw $1, %xmm1, %ecx +; SSE41-NEXT: shll %cl, %eax +; SSE41-NEXT: movd %xmm0, %edx +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: shll %cl, %edx +; SSE41-NEXT: movd %edx, %xmm2 +; SSE41-NEXT: pinsrw $1, %eax, %xmm2 +; SSE41-NEXT: pextrw $2, %xmm0, %eax +; SSE41-NEXT: pextrw $2, %xmm1, %ecx +; SSE41-NEXT: shll %cl, %eax +; SSE41-NEXT: pinsrw $2, %eax, %xmm2 +; SSE41-NEXT: pextrw $3, %xmm0, %eax +; SSE41-NEXT: pextrw $3, %xmm1, %ecx +; SSE41-NEXT: shll %cl, %eax +; SSE41-NEXT: pinsrw $3, %eax, %xmm2 +; SSE41-NEXT: pextrw $4, %xmm0, %eax +; SSE41-NEXT: pextrw $4, %xmm1, %ecx +; SSE41-NEXT: shll %cl, %eax +; SSE41-NEXT: pinsrw $4, %eax, %xmm2 +; SSE41-NEXT: pextrw $5, %xmm0, %eax +; SSE41-NEXT: pextrw $5, %xmm1, %ecx +; SSE41-NEXT: shll %cl, %eax +; SSE41-NEXT: pinsrw $5, %eax, %xmm2 +; SSE41-NEXT: pextrw $6, %xmm0, %eax +; SSE41-NEXT: pextrw $6, %xmm1, %ecx +; SSE41-NEXT: shll %cl, %eax +; SSE41-NEXT: pinsrw $6, %eax, %xmm2 +; SSE41-NEXT: pextrw $7, %xmm0, %eax +; SSE41-NEXT: pextrw $7, %xmm1, %ecx +; SSE41-NEXT: shll %cl, %eax +; SSE41-NEXT: pinsrw $7, %eax, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX: vpextrw $1, %xmm0, %eax +; AVX-NEXT: vpextrw $1, %xmm1, %ecx +; AVX-NEXT: shll %cl, %eax +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: vmovd %xmm1, %ecx +; AVX-NEXT: shll %cl, %edx +; AVX-NEXT: vmovd %edx, %xmm2 +; AVX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: vpextrw $2, %xmm1, %ecx +; AVX-NEXT: shll %cl, %eax +; AVX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: vpextrw $3, %xmm1, %ecx +; AVX-NEXT: shll %cl, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $4, %xmm0, %eax +; AVX-NEXT: vpextrw $4, %xmm1, %ecx +; AVX-NEXT: shll %cl, %eax +; AVX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $5, %xmm0, %eax +; AVX-NEXT: vpextrw $5, %xmm1, %ecx +; AVX-NEXT: shll %cl, %eax +; AVX-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; 
AVX-NEXT: vpextrw $6, %xmm0, %eax +; AVX-NEXT: vpextrw $6, %xmm1, %ecx +; AVX-NEXT: shll %cl, %eax +; AVX-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $7, %xmm0, %eax +; AVX-NEXT: vpextrw $7, %xmm1, %ecx +; AVX-NEXT: shll %cl, %eax +; AVX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 +; AVX-NEXT: retq + %shl = shl <8 x i16> %r, %a + %tmp2 = bitcast <8 x i16> %shl to <2 x i64> + ret <2 x i64> %tmp2 +} + +define <2 x i64> @shl_16i8(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp { +entry: +; SSE2: psllw $5, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pcmpeqb %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: psllw $2, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: paddb %xmm0, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psllw $5, %xmm1 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: paddb %xmm5, %xmm5 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm5, %xmm4 +; SSE41-NEXT: pcmpeqb %xmm3, %xmm4 +; SSE41-NEXT: pand %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm3, %xmm1 +; SSE41-NEXT: movdqa %xmm2, %xmm6 +; SSE41-NEXT: psllw $4, %xmm6 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm6 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pblendvb %xmm6, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psllw $2, %xmm1 +; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: paddb %xmm1, %xmm1 +; SSE41-NEXT: paddb %xmm5, %xmm5 +; SSE41-NEXT: pand %xmm3, %xmm5 +; SSE41-NEXT: pcmpeqb %xmm5, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: pblendvb %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX: vpsllw $5, %xmm1, %xmm1 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX-NEXT: vpand %xmm2, %xmm3, %xmm4 +; AVX-NEXT: vpcmpeqb %xmm3, %xmm4, %xmm4 +; AVX-NEXT: vpand %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpsllw $4, %xmm0, %xmm5 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm5, %xmm5 +; AVX-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $2, %xmm0, %xmm1 +; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; 
AVX-NEXT: retq + %shl = shl <16 x i8> %r, %a + %tmp2 = bitcast <16 x i8> %shl to <2 x i64> + ret <2 x i64> %tmp2 +} + +define <2 x i64> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind readnone ssp { +entry: +; SSE2: pextrw $7, %xmm1, %ecx +; SSE2-NEXT: pextrw $7, %xmm0, %eax +; SSE2-NEXT: sarw %cl, %ax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pextrw $3, %xmm1, %ecx +; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: sarw %cl, %ax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pextrw $5, %xmm1, %ecx +; SSE2-NEXT: pextrw $5, %xmm0, %eax +; SSE2-NEXT: sarw %cl, %ax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: sarw %cl, %ax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: sarw %cl, %ax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: sarw %cl, %ax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: sarw %cl, %ax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: sarw %cl, %ax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSE41: pextrw $1, %xmm1, %ecx +; SSE41-NEXT: pextrw $1, %xmm0, %eax +; SSE41-NEXT: sarw %cl, %ax +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: movd %xmm0, %edx +; SSE41-NEXT: sarw %cl, %dx +; SSE41-NEXT: movd %edx, %xmm2 +; SSE41-NEXT: pinsrw $1, %eax, %xmm2 +; SSE41-NEXT: pextrw $2, %xmm1, %ecx +; SSE41-NEXT: pextrw $2, %xmm0, %eax +; SSE41-NEXT: sarw %cl, %ax +; SSE41-NEXT: pinsrw $2, %eax, %xmm2 +; SSE41-NEXT: pextrw $3, %xmm1, %ecx +; SSE41-NEXT: pextrw $3, %xmm0, %eax +; SSE41-NEXT: sarw %cl, %ax +; SSE41-NEXT: pinsrw $3, %eax, %xmm2 +; SSE41-NEXT: pextrw $4, %xmm1, %ecx +; SSE41-NEXT: pextrw $4, %xmm0, %eax +; SSE41-NEXT: sarw %cl, %ax +; SSE41-NEXT: pinsrw $4, %eax, %xmm2 +; SSE41-NEXT: pextrw $5, %xmm1, %ecx +; SSE41-NEXT: pextrw $5, %xmm0, %eax +; SSE41-NEXT: sarw %cl, %ax +; SSE41-NEXT: pinsrw $5, %eax, %xmm2 +; SSE41-NEXT: pextrw $6, %xmm1, %ecx +; SSE41-NEXT: pextrw $6, %xmm0, %eax +; SSE41-NEXT: sarw %cl, %ax +; SSE41-NEXT: pinsrw $6, %eax, %xmm2 +; SSE41-NEXT: pextrw $7, %xmm1, %ecx +; SSE41-NEXT: pextrw $7, %xmm0, %eax +; SSE41-NEXT: sarw %cl, %ax +; SSE41-NEXT: pinsrw $7, %eax, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX: vpextrw $1, %xmm1, %ecx +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: sarw %cl, %ax +; AVX-NEXT: vmovd %xmm1, %ecx +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: sarw %cl, %dx +; AVX-NEXT: vmovd %edx, %xmm2 +; AVX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $2, %xmm1, %ecx +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: sarw %cl, %ax +; 
AVX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $3, %xmm1, %ecx +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: sarw %cl, %ax +; AVX-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $4, %xmm1, %ecx +; AVX-NEXT: vpextrw $4, %xmm0, %eax +; AVX-NEXT: sarw %cl, %ax +; AVX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $5, %xmm1, %ecx +; AVX-NEXT: vpextrw $5, %xmm0, %eax +; AVX-NEXT: sarw %cl, %ax +; AVX-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $6, %xmm1, %ecx +; AVX-NEXT: vpextrw $6, %xmm0, %eax +; AVX-NEXT: sarw %cl, %ax +; AVX-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $7, %xmm1, %ecx +; AVX-NEXT: vpextrw $7, %xmm0, %eax +; AVX-NEXT: sarw %cl, %ax +; AVX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 +; AVX-NEXT: retq + %ashr = ashr <8 x i16> %r, %a + %tmp2 = bitcast <8 x i16> %ashr to <2 x i64> + ret <2 x i64> %tmp2 +} + +define <2 x i64> @ashr_16i8(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp { +entry: +; +; SSE2: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movaps %xmm1, -24(%rsp) +; SSE2-NEXT: movaps %xmm0, -40(%rsp) +; SSE2-NEXT: movb -9(%rsp), %cl +; SSE2-NEXT: movb -25(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movb -17(%rsp), %cl +; SSE2-NEXT: movb -33(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movb -13(%rsp), %cl +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movl %eax, -44(%rsp) +; SSE2-NEXT: movb -29(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movzbl %al, %r9d +; SSE2-NEXT: movb -21(%rsp), %cl +; SSE2-NEXT: movb -37(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movb -11(%rsp), %cl +; SSE2-NEXT: movzbl %al, %r10d +; SSE2-NEXT: movb -27(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movb -19(%rsp), %cl +; SSE2-NEXT: movzbl %al, %r11d +; SSE2-NEXT: movb -35(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movb -15(%rsp), %cl +; SSE2-NEXT: movzbl %al, %r14d +; SSE2-NEXT: movb -31(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movzbl %al, %r15d +; SSE2-NEXT: movb -23(%rsp), %cl +; SSE2-NEXT: movb -39(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movb -10(%rsp), %cl +; SSE2-NEXT: movzbl %al, %r12d +; SSE2-NEXT: movb -26(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movb -18(%rsp), %cl +; SSE2-NEXT: movzbl %al, %r13d +; SSE2-NEXT: movb -34(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movb -14(%rsp), %cl +; SSE2-NEXT: movzbl %al, %r8d +; SSE2-NEXT: movb -30(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movb -22(%rsp), %cl +; SSE2-NEXT: movzbl %al, %ebp +; SSE2-NEXT: movb -38(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movb -12(%rsp), %cl +; SSE2-NEXT: movzbl %al, %edi +; SSE2-NEXT: movb -28(%rsp), %dl +; SSE2-NEXT: sarb %cl, %dl +; SSE2-NEXT: movb -20(%rsp), %cl +; SSE2-NEXT: movzbl %dl, %esi +; SSE2-NEXT: movb -36(%rsp), %bl +; SSE2-NEXT: sarb %cl, %bl +; SSE2-NEXT: movb -16(%rsp), %cl +; SSE2-NEXT: movzbl %bl, %ebx +; SSE2-NEXT: movb -32(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movzbl %al, %edx +; SSE2-NEXT: movb -24(%rsp), %cl +; SSE2-NEXT: movb -40(%rsp), %al +; SSE2-NEXT: sarb %cl, %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd -44(%rsp), %xmm1 +; SSE2: movd %r9d, %xmm2 +; SSE2-NEXT: movd %r10d, %xmm3 +; SSE2-NEXT: movd %r11d, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movd %r14d, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: movd %r15d, %xmm1 +; SSE2-NEXT: movd %r12d, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: movd %r13d, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movd %ebp, %xmm0 +; SSE2-NEXT: movd %edi, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: movd %ebx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSE41: pextrb $1, %xmm1, %ecx +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pextrb $0, %xmm1, %ecx +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: sarb %cl, %dl +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: movd %ecx, %xmm2 +; SSE41-NEXT: pinsrb $1, %eax, %xmm2 +; SSE41-NEXT: pextrb $2, %xmm1, %ecx +; SSE41-NEXT: pextrb $2, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm2 +; SSE41-NEXT: pextrb $3, %xmm1, %ecx +; 
SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm2 +; SSE41-NEXT: pextrb $4, %xmm1, %ecx +; SSE41-NEXT: pextrb $4, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm2 +; SSE41-NEXT: pextrb $5, %xmm1, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm2 +; SSE41-NEXT: pextrb $6, %xmm1, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm2 +; SSE41-NEXT: pextrb $7, %xmm1, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm2 +; SSE41-NEXT: pextrb $8, %xmm1, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm2 +; SSE41-NEXT: pextrb $9, %xmm1, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm2 +; SSE41-NEXT: pextrb $10, %xmm1, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm2 +; SSE41-NEXT: pextrb $11, %xmm1, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $11, %eax, %xmm2 +; SSE41-NEXT: pextrb $12, %xmm1, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm2 +; SSE41-NEXT: pextrb $13, %xmm1, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $13, %eax, %xmm2 +; SSE41-NEXT: pextrb $14, %xmm1, %ecx +; SSE41-NEXT: pextrb $14, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm2 +; SSE41-NEXT: pextrb $15, %xmm1, %ecx +; SSE41-NEXT: pextrb $15, %xmm0, %eax +; SSE41-NEXT: sarb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX: vpextrb $1, %xmm1, %ecx +; AVX-NEXT: vpextrb $1, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpextrb $0, %xmm1, %ecx +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: sarb %cl, %dl +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vmovd %ecx, %xmm2 +; AVX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $2, %xmm1, %ecx +; AVX-NEXT: vpextrb $2, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $3, %xmm1, %ecx +; AVX-NEXT: vpextrb $3, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $4, %xmm1, %ecx +; AVX-NEXT: vpextrb $4, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $5, %xmm1, %ecx +; AVX-NEXT: vpextrb $5, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $6, %xmm1, %ecx +; AVX-NEXT: vpextrb $6, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $7, %xmm1, %ecx 
+; AVX-NEXT: vpextrb $7, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $8, %xmm1, %ecx +; AVX-NEXT: vpextrb $8, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $9, %xmm1, %ecx +; AVX-NEXT: vpextrb $9, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $10, %xmm1, %ecx +; AVX-NEXT: vpextrb $10, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $11, %xmm1, %ecx +; AVX-NEXT: vpextrb $11, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $12, %xmm1, %ecx +; AVX-NEXT: vpextrb $12, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $13, %xmm1, %ecx +; AVX-NEXT: vpextrb $13, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $14, %xmm1, %ecx +; AVX-NEXT: vpextrb $14, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $15, %xmm1, %ecx +; AVX-NEXT: vpextrb $15, %xmm0, %eax +; AVX-NEXT: sarb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX-NEXT: retq + %ashr = ashr <16 x i8> %r, %a + %tmp2 = bitcast <16 x i8> %ashr to <2 x i64> + ret <2 x i64> %tmp2 +} + +define <2 x i64> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind readnone ssp { +entry: + +; SSE2: pextrw $7, %xmm0, %eax +; SSE2-NEXT: pextrw $7, %xmm1, %ecx +; SSE2-NEXT: shrl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: pextrw $3, %xmm1, %ecx +; SSE2-NEXT: shrl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pextrw $5, %xmm0, %eax +; SSE2-NEXT: pextrw $5, %xmm1, %ecx +; SSE2-NEXT: shrl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm1, %ecx +; SSE2-NEXT: shrl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: pextrw $6, %xmm1, %ecx +; SSE2-NEXT: shrl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: pextrw $2, %xmm1, %ecx +; SSE2-NEXT: shrl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: pextrw $4, %xmm1, %ecx +; SSE2-NEXT: shrl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: shrl %cl, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: retq +; +; SSE41: pextrw $1, %xmm0, %eax +; SSE41-NEXT: pextrw $1, %xmm1, %ecx +; SSE41-NEXT: shrl %cl, %eax +; SSE41-NEXT: movd %xmm1, %ecx +; SSE41-NEXT: movd %xmm0, %edx +; SSE41-NEXT: movzwl %dx, %edx +; SSE41-NEXT: shrl %cl, %edx +; SSE41-NEXT: movd %edx, %xmm2 +; SSE41-NEXT: pinsrw $1, %eax, %xmm2 +; SSE41-NEXT: pextrw $2, %xmm0, %eax +; SSE41-NEXT: pextrw $2, %xmm1, %ecx +; SSE41-NEXT: shrl %cl, %eax +; SSE41-NEXT: pinsrw $2, %eax, %xmm2 +; SSE41-NEXT: pextrw $3, %xmm0, %eax +; SSE41-NEXT: pextrw $3, %xmm1, %ecx +; SSE41-NEXT: shrl %cl, %eax +; SSE41-NEXT: pinsrw $3, %eax, %xmm2 +; SSE41-NEXT: pextrw $4, %xmm0, %eax +; SSE41-NEXT: pextrw $4, %xmm1, %ecx +; SSE41-NEXT: shrl %cl, %eax +; SSE41-NEXT: pinsrw $4, %eax, %xmm2 +; SSE41-NEXT: pextrw $5, %xmm0, %eax +; SSE41-NEXT: pextrw $5, %xmm1, %ecx +; SSE41-NEXT: shrl %cl, %eax +; SSE41-NEXT: pinsrw $5, %eax, %xmm2 +; SSE41-NEXT: pextrw $6, %xmm0, %eax +; SSE41-NEXT: pextrw $6, %xmm1, %ecx +; SSE41-NEXT: shrl %cl, %eax +; SSE41-NEXT: pinsrw $6, %eax, %xmm2 +; SSE41-NEXT: pextrw $7, %xmm0, %eax +; SSE41-NEXT: pextrw $7, %xmm1, %ecx +; SSE41-NEXT: shrl %cl, %eax +; SSE41-NEXT: pinsrw $7, %eax, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX: vpextrw $1, %xmm0, %eax +; AVX-NEXT: vpextrw $1, %xmm1, %ecx +; AVX-NEXT: shrl %cl, %eax +; AVX-NEXT: vmovd %xmm1, %ecx +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: shrl %cl, %edx +; AVX-NEXT: vmovd %edx, %xmm2 +; AVX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: vpextrw $2, %xmm1, %ecx +; AVX-NEXT: shrl %cl, %eax +; AVX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: vpextrw $3, %xmm1, %ecx +; AVX-NEXT: shrl %cl, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $4, %xmm0, %eax +; AVX-NEXT: vpextrw $4, %xmm1, %ecx +; AVX-NEXT: shrl %cl, %eax +; AVX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $5, %xmm0, %eax +; AVX-NEXT: vpextrw $5, %xmm1, %ecx +; AVX-NEXT: shrl %cl, %eax +; AVX-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $6, %xmm0, %eax +; AVX-NEXT: vpextrw $6, %xmm1, %ecx +; AVX-NEXT: shrl %cl, %eax +; AVX-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrw $7, %xmm0, %eax +; AVX-NEXT: vpextrw $7, %xmm1, %ecx +; AVX-NEXT: shrl %cl, %eax +; AVX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 +; AVX-NEXT: retq + %lshr = lshr <8 x i16> %r, %a + %tmp2 = bitcast <8 x i16> %lshr to <2 x i64> + ret <2 x i64> %tmp2 +} + +define <2 x i64> @lshr_16i8(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp { +entry: +; SSE2: pushq %rbp +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movaps %xmm1, -24(%rsp) +; SSE2-NEXT: movaps %xmm0, -40(%rsp) +; SSE2-NEXT: movb -9(%rsp), %cl +; SSE2-NEXT: movb -25(%rsp), %al +; SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movb -17(%rsp), %cl +; SSE2-NEXT: movb -33(%rsp), %al +; SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movb -13(%rsp), %cl +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movl %eax, -44(%rsp) +; SSE2-NEXT: movb -29(%rsp), %al +; SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movzbl %al, %r9d +; SSE2-NEXT: movb -21(%rsp), %cl +; SSE2-NEXT: movb -37(%rsp), %al +; SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movb -11(%rsp), %cl +; SSE2-NEXT: movzbl %al, %r10d +; SSE2-NEXT: movb -27(%rsp), %al +; 
SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movb -19(%rsp), %cl +; SSE2-NEXT: movzbl %al, %r11d +; SSE2-NEXT: movb -35(%rsp), %al +; SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movb -15(%rsp), %cl +; SSE2-NEXT: movzbl %al, %r14d +; SSE2-NEXT: movb -31(%rsp), %al +; SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movzbl %al, %r15d +; SSE2-NEXT: movb -23(%rsp), %cl +; SSE2-NEXT: movb -39(%rsp), %al +; SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movb -10(%rsp), %cl +; SSE2-NEXT: movzbl %al, %r12d +; SSE2-NEXT: movb -26(%rsp), %al +; SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movb -18(%rsp), %cl +; SSE2-NEXT: movzbl %al, %r13d +; SSE2-NEXT: movb -34(%rsp), %al +; SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movb -14(%rsp), %cl +; SSE2-NEXT: movzbl %al, %r8d +; SSE2-NEXT: movb -30(%rsp), %al +; SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movb -22(%rsp), %cl +; SSE2-NEXT: movzbl %al, %ebp +; SSE2-NEXT: movb -38(%rsp), %al +; SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movb -12(%rsp), %cl +; SSE2-NEXT: movzbl %al, %edi +; SSE2-NEXT: movb -28(%rsp), %dl +; SSE2-NEXT: shrb %cl, %dl +; SSE2-NEXT: movb -20(%rsp), %cl +; SSE2-NEXT: movzbl %dl, %esi +; SSE2-NEXT: movb -36(%rsp), %bl +; SSE2-NEXT: shrb %cl, %bl +; SSE2-NEXT: movb -16(%rsp), %cl +; SSE2-NEXT: movzbl %bl, %ebx +; SSE2-NEXT: movb -32(%rsp), %al +; SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movzbl %al, %edx +; SSE2-NEXT: movb -24(%rsp), %cl +; SSE2-NEXT: movb -40(%rsp), %al +; SSE2-NEXT: shrb %cl, %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: movd -44(%rsp), %xmm1 +; SSE2: movd %r9d, %xmm2 +; SSE2-NEXT: movd %r10d, %xmm3 +; SSE2-NEXT: movd %r11d, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movd %r14d, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: movd %r15d, %xmm1 +; SSE2-NEXT: movd %r12d, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: movd %r13d, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movd %ebp, %xmm0 +; SSE2-NEXT: movd %edi, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: movd %ebx, %xmm1 +; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSE41: pextrb $1, %xmm1, %ecx +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pextrb $0, %xmm1, %ecx +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: shrb %cl, %dl +; SSE41-NEXT: movzbl %dl, %ecx +; SSE41-NEXT: movd %ecx, %xmm2 +; SSE41-NEXT: pinsrb $1, %eax, %xmm2 +; SSE41-NEXT: pextrb $2, %xmm1, %ecx +; SSE41-NEXT: pextrb $2, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm2 +; SSE41-NEXT: pextrb $3, %xmm1, %ecx +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm2 +; SSE41-NEXT: pextrb $4, %xmm1, %ecx +; SSE41-NEXT: pextrb $4, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm2 +; SSE41-NEXT: pextrb $5, %xmm1, %ecx +; SSE41-NEXT: pextrb $5, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm2 +; SSE41-NEXT: pextrb $6, %xmm1, %ecx +; SSE41-NEXT: pextrb $6, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm2 +; SSE41-NEXT: pextrb $7, %xmm1, %ecx +; SSE41-NEXT: pextrb $7, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm2 +; SSE41-NEXT: pextrb $8, %xmm1, %ecx +; SSE41-NEXT: pextrb $8, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm2 +; SSE41-NEXT: pextrb $9, %xmm1, %ecx +; SSE41-NEXT: pextrb $9, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm2 +; SSE41-NEXT: pextrb $10, %xmm1, %ecx +; SSE41-NEXT: pextrb $10, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm2 +; SSE41-NEXT: pextrb $11, %xmm1, %ecx +; SSE41-NEXT: pextrb $11, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $11, %eax, %xmm2 +; SSE41-NEXT: pextrb $12, %xmm1, %ecx +; SSE41-NEXT: pextrb $12, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm2 +; SSE41-NEXT: pextrb $13, %xmm1, %ecx +; SSE41-NEXT: pextrb $13, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $13, %eax, %xmm2 +; SSE41-NEXT: pextrb $14, %xmm1, %ecx 
+; SSE41-NEXT: pextrb $14, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm2 +; SSE41-NEXT: pextrb $15, %xmm1, %ecx +; SSE41-NEXT: pextrb $15, %xmm0, %eax +; SSE41-NEXT: shrb %cl, %al +; SSE41-NEXT: movzbl %al, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: retq +; +; AVX: vpextrb $1, %xmm1, %ecx +; AVX-NEXT: vpextrb $1, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpextrb $0, %xmm1, %ecx +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: shrb %cl, %dl +; AVX-NEXT: movzbl %dl, %ecx +; AVX-NEXT: vmovd %ecx, %xmm2 +; AVX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $2, %xmm1, %ecx +; AVX-NEXT: vpextrb $2, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $3, %xmm1, %ecx +; AVX-NEXT: vpextrb $3, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $4, %xmm1, %ecx +; AVX-NEXT: vpextrb $4, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $5, %xmm1, %ecx +; AVX-NEXT: vpextrb $5, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $6, %xmm1, %ecx +; AVX-NEXT: vpextrb $6, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $7, %xmm1, %ecx +; AVX-NEXT: vpextrb $7, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $8, %xmm1, %ecx +; AVX-NEXT: vpextrb $8, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $9, %xmm1, %ecx +; AVX-NEXT: vpextrb $9, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $10, %xmm1, %ecx +; AVX-NEXT: vpextrb $10, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $11, %xmm1, %ecx +; AVX-NEXT: vpextrb $11, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $12, %xmm1, %ecx +; AVX-NEXT: vpextrb $12, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $13, %xmm1, %ecx +; AVX-NEXT: vpextrb $13, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $14, %xmm1, %ecx +; AVX-NEXT: vpextrb $14, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX-NEXT: vpextrb $15, %xmm1, %ecx +; AVX-NEXT: vpextrb $15, %xmm0, %eax +; AVX-NEXT: shrb %cl, %al +; AVX-NEXT: movzbl %al, %eax +; AVX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX-NEXT: retq + %lshr = lshr <16 x i8> %r, %a + %tmp2 = bitcast <16 x i8> %lshr to <2 x i64> + ret <2 x i64> %tmp2 +} diff --git a/test/CodeGen/X86/vector-ctpop.ll b/test/CodeGen/X86/vector-ctpop.ll deleted file mode 100644 index 59d67928c6fa..000000000000 --- a/test/CodeGen/X86/vector-ctpop.ll +++ /dev/null @@ -1,159 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin 
-mattr=avx2 | FileCheck -check-prefix=AVX2 %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx -mattr=-popcnt | FileCheck -check-prefix=AVX1-NOPOPCNT %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -mattr=-popcnt | FileCheck -check-prefix=AVX2-NOPOPCNT %s - -; Vector version of: -; v = v - ((v >> 1) & 0x55555555) -; v = (v & 0x33333333) + ((v >> 2) & 0x33333333) -; v = (v + (v >> 4) & 0xF0F0F0F) -; v = v + (v >> 8) -; v = v + (v >> 16) -; v = v + (v >> 32) ; i64 only - -define <8 x i32> @test0(<8 x i32> %x) { -; AVX2-LABEL: @test0 -entry: -; AVX2: vpsrld $1, %ymm -; AVX2-NEXT: vpbroadcastd -; AVX2-NEXT: vpand -; AVX2-NEXT: vpsubd -; AVX2-NEXT: vpbroadcastd -; AVX2-NEXT: vpand -; AVX2-NEXT: vpsrld $2 -; AVX2-NEXT: vpand -; AVX2-NEXT: vpaddd -; AVX2-NEXT: vpsrld $4 -; AVX2-NEXT: vpaddd -; AVX2-NEXT: vpbroadcastd -; AVX2-NEXT: vpand -; AVX2-NEXT: vpsrld $8 -; AVX2-NEXT: vpaddd -; AVX2-NEXT: vpsrld $16 -; AVX2-NEXT: vpaddd -; AVX2-NEXT: vpbroadcastd -; AVX2-NEXT: vpand - %y = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %x) - ret <8 x i32> %y -} - -define <4 x i64> @test1(<4 x i64> %x) { -; AVX2-NOPOPCNT-LABEL: @test1 -entry: -; AVX2-NOPOPCNT: vpsrlq $1, %ymm -; AVX2-NOPOPCNT-NEXT: vpbroadcastq -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsubq -; AVX2-NOPOPCNT-NEXT: vpbroadcastq -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsrlq $2 -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpsrlq $4 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpbroadcastq -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsrlq $8 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpsrlq $16 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpsrlq $32 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpbroadcastq -; AVX2-NOPOPCNT-NEXT: vpand - %y = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %x) - ret <4 x i64> %y -} - -define <4 x i32> @test2(<4 x i32> %x) { -; AVX2-NOPOPCNT-LABEL: @test2 -; AVX1-NOPOPCNT-LABEL: @test2 -entry: -; AVX2-NOPOPCNT: vpsrld $1, %xmm -; AVX2-NOPOPCNT-NEXT: vpbroadcastd -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsubd -; AVX2-NOPOPCNT-NEXT: vpbroadcastd -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsrld $2 -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpaddd -; AVX2-NOPOPCNT-NEXT: vpsrld $4 -; AVX2-NOPOPCNT-NEXT: vpaddd -; AVX2-NOPOPCNT-NEXT: vpbroadcastd -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsrld $8 -; AVX2-NOPOPCNT-NEXT: vpaddd -; AVX2-NOPOPCNT-NEXT: vpsrld $16 -; AVX2-NOPOPCNT-NEXT: vpaddd -; AVX2-NOPOPCNT-NEXT: vpbroadcastd -; AVX2-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT: vpsrld $1, %xmm -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpsubd -; AVX1-NOPOPCNT-NEXT: vmovdqa -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpsrld $2 -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpaddd -; AVX1-NOPOPCNT-NEXT: vpsrld $4 -; AVX1-NOPOPCNT-NEXT: vpaddd -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpsrld $8 -; AVX1-NOPOPCNT-NEXT: vpaddd -; AVX1-NOPOPCNT-NEXT: vpsrld $16 -; AVX1-NOPOPCNT-NEXT: vpaddd -; AVX1-NOPOPCNT-NEXT: vpand - %y = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %x) - ret <4 x i32> %y -} - -define <2 x i64> @test3(<2 x i64> %x) { -; AVX2-NOPOPCNT-LABEL: @test3 -; AVX1-NOPOPCNT-LABEL: @test3 -entry: -; AVX2-NOPOPCNT: vpsrlq $1, %xmm -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsubq -; AVX2-NOPOPCNT-NEXT: vmovdqa -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsrlq $2 -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpaddq -; 
AVX2-NOPOPCNT-NEXT: vpsrlq $4 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpand -; AVX2-NOPOPCNT-NEXT: vpsrlq $8 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpsrlq $16 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpsrlq $32 -; AVX2-NOPOPCNT-NEXT: vpaddq -; AVX2-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT: vpsrlq $1, %xmm -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpsubq -; AVX1-NOPOPCNT-NEXT: vmovdqa -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpsrlq $2 -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpaddq -; AVX1-NOPOPCNT-NEXT: vpsrlq $4 -; AVX1-NOPOPCNT-NEXT: vpaddq -; AVX1-NOPOPCNT-NEXT: vpand -; AVX1-NOPOPCNT-NEXT: vpsrlq $8 -; AVX1-NOPOPCNT-NEXT: vpaddq -; AVX1-NOPOPCNT-NEXT: vpsrlq $16 -; AVX1-NOPOPCNT-NEXT: vpaddq -; AVX1-NOPOPCNT-NEXT: vpsrlq $32 -; AVX1-NOPOPCNT-NEXT: vpaddq -; AVX1-NOPOPCNT-NEXT: vpand - %y = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %x) - ret <2 x i64> %y -} - -declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) -declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) - -declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) -declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) - diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll new file mode 100644 index 000000000000..b43188b7c6ea --- /dev/null +++ b/test/CodeGen/X86/vector-lzcnt-128.ll @@ -0,0 +1,1915 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +target triple = "x86_64-unknown-unknown" + +define <2 x i64> @testv2i64(<2 x i64> %in) { +; SSE2-LABEL: testv2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: bsrq %rax, %rax +; SSE2-NEXT: movl $127, %ecx +; SSE2-NEXT: cmoveq %rcx, %rax +; SSE2-NEXT: xorq $63, %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd $78, %xmm0, %xmm0 # xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: bsrq %rax, %rax +; SSE2-NEXT: cmoveq %rcx, %rax +; SSE2-NEXT: xorq $63, %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv2i64: +; SSE3: # BB#0: +; SSE3-NEXT: movd %xmm0, %rax +; SSE3-NEXT: bsrq %rax, %rax +; SSE3-NEXT: movl $127, %ecx +; SSE3-NEXT: cmoveq %rcx, %rax +; SSE3-NEXT: xorq $63, %rax +; SSE3-NEXT: movd %rax, %xmm1 +; SSE3-NEXT: pshufd $78, %xmm0, %xmm0 # xmm0 = xmm0[2,3,0,1] +; SSE3-NEXT: movd %xmm0, %rax +; SSE3-NEXT: bsrq %rax, %rax +; SSE3-NEXT: cmoveq %rcx, %rax +; SSE3-NEXT: xorq $63, %rax +; SSE3-NEXT: movd %rax, %xmm0 +; SSE3-NEXT: punpcklqdq %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0] +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv2i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: bsrq %rax, 
%rax +; SSSE3-NEXT: movl $127, %ecx +; SSSE3-NEXT: cmoveq %rcx, %rax +; SSSE3-NEXT: xorq $63, %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd $78, %xmm0, %xmm0 # xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: bsrq %rax, %rax +; SSSE3-NEXT: cmoveq %rcx, %rax +; SSSE3-NEXT: xorq $63, %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq + +; +; SSE41-LABEL: testv2i64: +; SSE41: # BB#0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: bsrq %rax, %rax +; SSE41-NEXT: movl $127, %ecx +; SSE41-NEXT: cmoveq %rcx, %rax +; SSE41-NEXT: xorq $63, %rax +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: bsrq %rax, %rax +; SSE41-NEXT: cmoveq %rcx, %rax +; SSE41-NEXT: xorq $63, %rax +; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: testv2i64: +; AVX: # BB#0: +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: bsrq %rax, %rax +; AVX-NEXT: movl $127, %ecx +; AVX-NEXT: cmoveq %rcx, %rax +; AVX-NEXT: xorq $63, %rax +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: bsrq %rax, %rax +; AVX-NEXT: cmoveq %rcx, %rax +; AVX-NEXT: xorq $63, %rax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0) + ret <2 x i64> %out +} + +define <2 x i64> @testv2i64u(<2 x i64> %in) { +; SSE2-LABEL: testv2i64u: +; SSE2: # BB#0: +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: bsrq %rax, %rax +; SSE2-NEXT: xorq $63, %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: bsrq %rax, %rax +; SSE2-NEXT: xorq $63, %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv2i64u: +; SSE3: # BB#0: +; SSE3-NEXT: movd %xmm0, %rax +; SSE3-NEXT: bsrq %rax, %rax +; SSE3-NEXT: xorq $63, %rax +; SSE3-NEXT: movd %rax, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE3-NEXT: movd %xmm0, %rax +; SSE3-NEXT: bsrq %rax, %rax +; SSE3-NEXT: xorq $63, %rax +; SSE3-NEXT: movd %rax, %xmm0 +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv2i64u: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: bsrq %rax, %rax +; SSSE3-NEXT: xorq $63, %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: bsrq %rax, %rax +; SSSE3-NEXT: xorq $63, %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv2i64u: +; SSE41: # BB#0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: bsrq %rax, %rax +; SSE41-NEXT: xorq $63, %rax +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: bsrq %rax, %rax +; SSE41-NEXT: xorq $63, %rax +; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: testv2i64u: +; AVX: # BB#0: +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: bsrq %rax, %rax +; AVX-NEXT: xorq $63, %rax +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: bsrq %rax, %rax +; AVX-NEXT: xorq $63, %rax +; 
AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 -1) + ret <2 x i64> %out +} + +define <4 x i32> @testv4i32(<4 x i32> %in) { +; SSE2-LABEL: testv4i32: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: bsrl %eax, %eax +; SSE2-NEXT: movl $63, %ecx +; SSE2-NEXT: cmovel %ecx, %eax +; SSE2-NEXT: xorl $31, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: bsrl %eax, %eax +; SSE2-NEXT: cmovel %ecx, %eax +; SSE2-NEXT: xorl $31, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: bsrl %eax, %eax +; SSE2-NEXT: cmovel %ecx, %eax +; SSE2-NEXT: xorl $31, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: bsrl %eax, %eax +; SSE2-NEXT: cmovel %ecx, %eax +; SSE2-NEXT: xorl $31, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv4i32: +; SSE3: # BB#0: +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSE3-NEXT: movd %xmm1, %eax +; SSE3-NEXT: bsrl %eax, %eax +; SSE3-NEXT: movl $63, %ecx +; SSE3-NEXT: cmovel %ecx, %eax +; SSE3-NEXT: xorl $31, %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE3-NEXT: movd %xmm2, %eax +; SSE3-NEXT: bsrl %eax, %eax +; SSE3-NEXT: cmovel %ecx, %eax +; SSE3-NEXT: xorl $31, %eax +; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: bsrl %eax, %eax +; SSE3-NEXT: cmovel %ecx, %eax +; SSE3-NEXT: xorl $31, %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: bsrl %eax, %eax +; SSE3-NEXT: cmovel %ecx, %eax +; SSE3-NEXT: xorl $31, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv4i32: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: bsrl %eax, %eax +; SSSE3-NEXT: movl $63, %ecx +; SSSE3-NEXT: cmovel %ecx, %eax +; SSSE3-NEXT: xorl $31, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: bsrl %eax, %eax +; SSSE3-NEXT: cmovel %ecx, %eax +; SSSE3-NEXT: xorl $31, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: bsrl %eax, %eax +; SSSE3-NEXT: cmovel %ecx, %eax +; SSSE3-NEXT: xorl $31, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: bsrl %eax, %eax +; SSSE3-NEXT: cmovel %ecx, %eax +; SSSE3-NEXT: xorl $31, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: 
movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv4i32: +; SSE41: # BB#0: +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: movl $63, %ecx +; SSE41-NEXT: cmovel %ecx, %eax +; SSE41-NEXT: xorl $31, %eax +; SSE41-NEXT: movd %xmm0, %edx +; SSE41-NEXT: bsrl %edx, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: xorl $31, %edx +; SSE41-NEXT: movd %edx, %xmm1 +; SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; SSE41-NEXT: pextrd $2, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: cmovel %ecx, %eax +; SSE41-NEXT: xorl $31, %eax +; SSE41-NEXT: pinsrd $2, %eax, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: cmovel %ecx, %eax +; SSE41-NEXT: xorl $31, %eax +; SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv4i32: +; AVX: # BB#0: +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: movl $63, %ecx +; AVX-NEXT: cmovel %ecx, %eax +; AVX-NEXT: xorl $31, %eax +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: bsrl %edx, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: xorl $31, %edx +; AVX-NEXT: vmovd %edx, %xmm1 +; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: cmovel %ecx, %eax +; AVX-NEXT: xorl $31, %eax +; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $3, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: cmovel %ecx, %eax +; AVX-NEXT: xorl $31, %eax +; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 0) + ret <4 x i32> %out +} + +define <4 x i32> @testv4i32u(<4 x i32> %in) { +; SSE2-LABEL: testv4i32u: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: bsrl %eax, %eax +; SSE2-NEXT: xorl $31, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: bsrl %eax, %eax +; SSE2-NEXT: xorl $31, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: bsrl %eax, %eax +; SSE2-NEXT: xorl $31, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: bsrl %eax, %eax +; SSE2-NEXT: xorl $31, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv4i32u: +; SSE3: # BB#0: +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSE3-NEXT: movd %xmm1, %eax +; SSE3-NEXT: bsrl %eax, %eax +; SSE3-NEXT: xorl $31, %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE3-NEXT: movd %xmm2, %eax +; SSE3-NEXT: bsrl %eax, %eax +; SSE3-NEXT: xorl $31, %eax +; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: bsrl %eax, %eax +; SSE3-NEXT: xorl $31, %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: bsrl %eax, %eax +; SSE3-NEXT: xorl $31, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv4i32u: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: bsrl %eax, %eax +; SSSE3-NEXT: xorl $31, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: bsrl %eax, %eax +; SSSE3-NEXT: xorl $31, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: bsrl %eax, %eax +; SSSE3-NEXT: xorl $31, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: bsrl %eax, %eax +; SSSE3-NEXT: xorl $31, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv4i32u: +; SSE41: # BB#0: +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $31, %eax +; SSE41-NEXT: movd %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: xorl $31, %ecx +; SSE41-NEXT: movd %ecx, %xmm1 +; SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; SSE41-NEXT: pextrd $2, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $31, %eax +; SSE41-NEXT: pinsrd $2, %eax, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $31, %eax +; SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv4i32u: +; AVX: # BB#0: +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $31, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: xorl $31, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $31, %eax +; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $3, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $31, %eax +; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %in, i1 -1) + ret <4 x i32> %out +} + +define <8 x i16> @testv8i16(<8 x i16> %in) { +; SSE2-LABEL: testv8i16: +; SSE2: # BB#0: +; SSE2-NEXT: pextrw $7, %xmm0, %eax +; SSE2-NEXT: bsrw %ax, %cx +; SSE2-NEXT: movw $31, %ax +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: bsrw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: bsrw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: bsrw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: bsrw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: xorl 
$15, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: bsrw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: bsrw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: bsrw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv8i16: +; SSE3: # BB#0: +; SSE3-NEXT: pextrw $7, %xmm0, %eax +; SSE3-NEXT: bsrw %ax, %cx +; SSE3-NEXT: movw $31, %ax +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: xorl $15, %ecx +; SSE3-NEXT: movd %ecx, %xmm1 +; SSE3-NEXT: pextrw $3, %xmm0, %ecx +; SSE3-NEXT: bsrw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: xorl $15, %ecx +; SSE3-NEXT: movd %ecx, %xmm2 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: pextrw $5, %xmm0, %ecx +; SSE3-NEXT: bsrw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: xorl $15, %ecx +; SSE3-NEXT: movd %ecx, %xmm3 +; SSE3-NEXT: pextrw $1, %xmm0, %ecx +; SSE3-NEXT: bsrw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: xorl $15, %ecx +; SSE3-NEXT: movd %ecx, %xmm1 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE3-NEXT: pextrw $6, %xmm0, %ecx +; SSE3-NEXT: bsrw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: xorl $15, %ecx +; SSE3-NEXT: movd %ecx, %xmm2 +; SSE3-NEXT: pextrw $2, %xmm0, %ecx +; SSE3-NEXT: bsrw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: xorl $15, %ecx +; SSE3-NEXT: movd %ecx, %xmm3 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE3-NEXT: pextrw $4, %xmm0, %ecx +; SSE3-NEXT: bsrw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: xorl $15, %ecx +; SSE3-NEXT: movd %ecx, %xmm2 +; SSE3-NEXT: movd %xmm0, %ecx +; SSE3-NEXT: bsrw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: xorl $15, %ecx +; SSE3-NEXT: movd %ecx, %xmm0 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv8i16: +; SSSE3: # BB#0: +; SSSE3-NEXT: pextrw $7, %xmm0, %eax +; SSSE3-NEXT: bsrw %ax, %cx +; SSSE3-NEXT: movw $31, %ax +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: xorl $15, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: pextrw $3, %xmm0, %ecx +; SSSE3-NEXT: bsrw %cx, %cx +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: xorl $15, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: pextrw $5, %xmm0, %ecx +; SSSE3-NEXT: bsrw %cx, 
%cx +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: xorl $15, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: pextrw $1, %xmm0, %ecx +; SSSE3-NEXT: bsrw %cx, %cx +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: xorl $15, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: pextrw $6, %xmm0, %ecx +; SSSE3-NEXT: bsrw %cx, %cx +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: xorl $15, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: pextrw $2, %xmm0, %ecx +; SSSE3-NEXT: bsrw %cx, %cx +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: xorl $15, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: pextrw $4, %xmm0, %ecx +; SSSE3-NEXT: bsrw %cx, %cx +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: xorl $15, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: bsrw %cx, %cx +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: xorl $15, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv8i16: +; SSE41: # BB#0: +; SSE41-NEXT: pextrw $1, %xmm0, %eax +; SSE41-NEXT: bsrw %ax, %cx +; SSE41-NEXT: movw $31, %ax +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: xorl $15, %ecx +; SSE41-NEXT: movd %xmm0, %edx +; SSE41-NEXT: bsrw %dx, %dx +; SSE41-NEXT: cmovew %ax, %dx +; SSE41-NEXT: xorl $15, %edx +; SSE41-NEXT: movd %edx, %xmm1 +; SSE41-NEXT: pinsrw $1, %ecx, %xmm1 +; SSE41-NEXT: pextrw $2, %xmm0, %ecx +; SSE41-NEXT: bsrw %cx, %cx +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: xorl $15, %ecx +; SSE41-NEXT: pinsrw $2, %ecx, %xmm1 +; SSE41-NEXT: pextrw $3, %xmm0, %ecx +; SSE41-NEXT: bsrw %cx, %cx +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: xorl $15, %ecx +; SSE41-NEXT: pinsrw $3, %ecx, %xmm1 +; SSE41-NEXT: pextrw $4, %xmm0, %ecx +; SSE41-NEXT: bsrw %cx, %cx +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: xorl $15, %ecx +; SSE41-NEXT: pinsrw $4, %ecx, %xmm1 +; SSE41-NEXT: pextrw $5, %xmm0, %ecx +; SSE41-NEXT: bsrw %cx, %cx +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: xorl $15, %ecx +; SSE41-NEXT: pinsrw $5, %ecx, %xmm1 +; SSE41-NEXT: pextrw $6, %xmm0, %ecx +; SSE41-NEXT: bsrw %cx, %cx +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: xorl $15, %ecx +; SSE41-NEXT: pinsrw $6, %ecx, %xmm1 +; SSE41-NEXT: pextrw $7, %xmm0, %ecx +; SSE41-NEXT: bsrw %cx, %cx +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: xorl $15, %ecx +; SSE41-NEXT: pinsrw $7, %ecx, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv8i16: +; AVX: # BB#0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: bsrw %ax, %cx +; AVX-NEXT: movw $31, %ax +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: xorl $15, %ecx +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: bsrw %dx, %dx +; AVX-NEXT: cmovew %ax, %dx +; AVX-NEXT: xorl $15, %edx +; AVX-NEXT: vmovd %edx, %xmm1 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %ecx +; AVX-NEXT: bsrw %cx, %cx +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: xorl $15, %ecx +; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 +; AVX-NEXT: 
vpextrw $3, %xmm0, %ecx +; AVX-NEXT: bsrw %cx, %cx +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: xorl $15, %ecx +; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $4, %xmm0, %ecx +; AVX-NEXT: bsrw %cx, %cx +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: xorl $15, %ecx +; AVX-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $5, %xmm0, %ecx +; AVX-NEXT: bsrw %cx, %cx +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: xorl $15, %ecx +; AVX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $6, %xmm0, %ecx +; AVX-NEXT: bsrw %cx, %cx +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: xorl $15, %ecx +; AVX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $7, %xmm0, %ecx +; AVX-NEXT: bsrw %cx, %cx +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: xorl $15, %ecx +; AVX-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm0 +; AVX-NEXT: retq + %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 0) + ret <8 x i16> %out +} + +define <8 x i16> @testv8i16u(<8 x i16> %in) { +; SSE2-LABEL: testv8i16u: +; SSE2: # BB#0: +; SSE2-NEXT: pextrw $7, %xmm0, %eax +; SSE2-NEXT: bsrw %ax, %ax +; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: bsrw %ax, %ax +; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: pextrw $5, %xmm0, %eax +; SSE2-NEXT: bsrw %ax, %ax +; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: bsrw %ax, %ax +; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: bsrw %ax, %ax +; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: bsrw %ax, %ax +; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: bsrw %ax, %ax +; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: bsrw %ax, %ax +; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv8i16u: +; SSE3: # BB#0: +; SSE3-NEXT: pextrw $7, %xmm0, %eax +; SSE3-NEXT: bsrw %ax, %ax +; SSE3-NEXT: xorl $15, %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: pextrw $3, %xmm0, %eax +; SSE3-NEXT: bsrw %ax, %ax +; SSE3-NEXT: xorl $15, %eax +; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: pextrw $5, %xmm0, %eax +; SSE3-NEXT: bsrw %ax, %ax +; SSE3-NEXT: xorl $15, %eax +; SSE3-NEXT: movd %eax, %xmm3 +; SSE3-NEXT: pextrw $1, %xmm0, %eax +; SSE3-NEXT: bsrw %ax, %ax +; SSE3-NEXT: xorl $15, %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE3-NEXT: pextrw $6, %xmm0, %eax +; SSE3-NEXT: bsrw %ax, %ax +; SSE3-NEXT: xorl $15, %eax +; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: pextrw $2, %xmm0, %eax +; SSE3-NEXT: bsrw %ax, %ax +; SSE3-NEXT: xorl $15, %eax +; SSE3-NEXT: movd %eax, %xmm3 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE3-NEXT: pextrw $4, %xmm0, %eax +; SSE3-NEXT: bsrw %ax, %ax +; SSE3-NEXT: xorl $15, %eax +; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: bsrw %ax, %ax +; SSE3-NEXT: xorl $15, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv8i16u: +; SSSE3: # BB#0: +; SSSE3-NEXT: pextrw $7, %xmm0, %eax +; SSSE3-NEXT: bsrw %ax, %ax +; SSSE3-NEXT: xorl $15, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pextrw $3, %xmm0, %eax +; SSSE3-NEXT: bsrw %ax, %ax +; SSSE3-NEXT: xorl $15, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: pextrw $5, %xmm0, %eax +; SSSE3-NEXT: bsrw %ax, %ax +; SSSE3-NEXT: xorl $15, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: pextrw $1, %xmm0, %eax +; SSSE3-NEXT: bsrw %ax, %ax +; SSSE3-NEXT: xorl $15, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: pextrw $6, %xmm0, %eax +; SSSE3-NEXT: bsrw %ax, %ax +; SSSE3-NEXT: xorl $15, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: pextrw $2, %xmm0, %eax +; SSSE3-NEXT: bsrw %ax, %ax +; SSSE3-NEXT: xorl $15, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: pextrw $4, %xmm0, %eax +; SSSE3-NEXT: bsrw %ax, %ax +; SSSE3-NEXT: xorl $15, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: bsrw %ax, %ax +; SSSE3-NEXT: xorl $15, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv8i16u: +; SSE41: # BB#0: +; SSE41-NEXT: pextrw $1, %xmm0, %eax +; SSE41-NEXT: bsrw %ax, %ax +; SSE41-NEXT: xorl $15, %eax +; SSE41-NEXT: movd %xmm0, %ecx +; SSE41-NEXT: bsrw %cx, %cx +; SSE41-NEXT: xorl $15, %ecx +; SSE41-NEXT: movd %ecx, %xmm1 +; SSE41-NEXT: pinsrw $1, %eax, %xmm1 +; SSE41-NEXT: pextrw $2, %xmm0, %eax +; SSE41-NEXT: bsrw %ax, %ax +; SSE41-NEXT: xorl $15, %eax +; SSE41-NEXT: pinsrw $2, %eax, %xmm1 +; SSE41-NEXT: pextrw $3, %xmm0, %eax +; SSE41-NEXT: bsrw %ax, %ax +; SSE41-NEXT: xorl $15, %eax +; SSE41-NEXT: pinsrw $3, %eax, %xmm1 +; SSE41-NEXT: pextrw $4, %xmm0, %eax +; SSE41-NEXT: bsrw %ax, %ax +; SSE41-NEXT: xorl $15, %eax +; SSE41-NEXT: pinsrw $4, %eax, %xmm1 +; SSE41-NEXT: 
pextrw $5, %xmm0, %eax +; SSE41-NEXT: bsrw %ax, %ax +; SSE41-NEXT: xorl $15, %eax +; SSE41-NEXT: pinsrw $5, %eax, %xmm1 +; SSE41-NEXT: pextrw $6, %xmm0, %eax +; SSE41-NEXT: bsrw %ax, %ax +; SSE41-NEXT: xorl $15, %eax +; SSE41-NEXT: pinsrw $6, %eax, %xmm1 +; SSE41-NEXT: pextrw $7, %xmm0, %eax +; SSE41-NEXT: bsrw %ax, %ax +; SSE41-NEXT: xorl $15, %eax +; SSE41-NEXT: pinsrw $7, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv8i16u: +; AVX: # BB#0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: bsrw %ax, %ax +; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: bsrw %cx, %cx +; AVX-NEXT: xorl $15, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: bsrw %ax, %ax +; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: bsrw %ax, %ax +; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $4, %xmm0, %eax +; AVX-NEXT: bsrw %ax, %ax +; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $5, %xmm0, %eax +; AVX-NEXT: bsrw %ax, %ax +; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $6, %xmm0, %eax +; AVX-NEXT: bsrw %ax, %ax +; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $7, %xmm0, %eax +; AVX-NEXT: bsrw %ax, %ax +; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %in, i1 -1) + ret <8 x i16> %out +} + +define <16 x i8> @testv16i8(<16 x i8> %in) { +; SSE2-LABEL: testv16i8: +; SSE2: # BB#0: +; SSE2: pushq %rbp +; SSE2: movaps %xmm0, -24(%rsp) +; SSE2-NEXT: movzbl -9(%rsp), %eax +; SSE2-NEXT: bsrl %eax, %ecx +; SSE2-NEXT: movl $15, %eax +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movzbl -10(%rsp), %ebx +; SSE2-NEXT: movzbl -11(%rsp), %edi +; SSE2-NEXT: movzbl -12(%rsp), %r9d +; SSE2-NEXT: movzbl -13(%rsp), %edx +; SSE2-NEXT: movzbl -14(%rsp), %r11d +; SSE2-NEXT: movzbl -15(%rsp), %esi +; SSE2-NEXT: movzbl -16(%rsp), %r8d +; SSE2-NEXT: movzbl -17(%rsp), %ecx +; SSE2-NEXT: bsrl %ecx, %ecx +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: bsrl %edx, %ecx +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: movzbl -18(%rsp), %edx +; SSE2-NEXT: movzbl -19(%rsp), %ecx +; SSE2-NEXT: movzbl -20(%rsp), %r10d +; SSE2-NEXT: movzbl -21(%rsp), %ebp +; SSE2-NEXT: bsrl %ebp, %ebp +; SSE2-NEXT: cmovel %eax, %ebp +; SSE2-NEXT: xorl $7, %ebp +; SSE2-NEXT: movd %ebp, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: bsrl %edi, %edi +; SSE2-NEXT: cmovel %eax, %edi +; SSE2-NEXT: xorl $7, %edi +; SSE2-NEXT: movd %edi, %xmm1 +; SSE2-NEXT: bsrl %ecx, %ecx +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: bsrl %esi, %ecx +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: movzbl -22(%rsp), %esi +; SSE2-NEXT: movzbl -23(%rsp), %ecx +; SSE2-NEXT: bsrl %ecx, %ecx +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: bsrl %ebx, %ecx +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: bsrl %edx, %ecx +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: bsrl %r11d, %ecx +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: bsrl %esi, %ecx +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: bsrl %r9d, %ecx +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: bsrl %r10d, %ecx +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: bsrl %r8d, %ecx +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm4 +; SSE2-NEXT: movzbl -24(%rsp), %ecx +; SSE2-NEXT: bsrl %ecx, %ecx +; SSE2-NEXT: cmovel %eax, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv16i8: +; SSE3: # BB#0: +; SSE3: pushq %rbp +; SSE3: movaps %xmm0, -24(%rsp) +; SSE3-NEXT: movzbl -9(%rsp), %eax +; SSE3-NEXT: 
bsrl %eax, %ecx +; SSE3-NEXT: movl $15, %eax +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm0 +; SSE3-NEXT: movzbl -10(%rsp), %ebx +; SSE3-NEXT: movzbl -11(%rsp), %edi +; SSE3-NEXT: movzbl -12(%rsp), %r9d +; SSE3-NEXT: movzbl -13(%rsp), %edx +; SSE3-NEXT: movzbl -14(%rsp), %r11d +; SSE3-NEXT: movzbl -15(%rsp), %esi +; SSE3-NEXT: movzbl -16(%rsp), %r8d +; SSE3-NEXT: movzbl -17(%rsp), %ecx +; SSE3-NEXT: bsrl %ecx, %ecx +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm1 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE3-NEXT: bsrl %edx, %ecx +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm2 +; SSE3-NEXT: movzbl -18(%rsp), %edx +; SSE3-NEXT: movzbl -19(%rsp), %ecx +; SSE3-NEXT: movzbl -20(%rsp), %r10d +; SSE3-NEXT: movzbl -21(%rsp), %ebp +; SSE3-NEXT: bsrl %ebp, %ebp +; SSE3-NEXT: cmovel %eax, %ebp +; SSE3-NEXT: xorl $7, %ebp +; SSE3-NEXT: movd %ebp, %xmm0 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE3-NEXT: bsrl %edi, %edi +; SSE3-NEXT: cmovel %eax, %edi +; SSE3-NEXT: xorl $7, %edi +; SSE3-NEXT: movd %edi, %xmm1 +; SSE3-NEXT: bsrl %ecx, %ecx +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm2 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE3-NEXT: bsrl %esi, %ecx +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm3 +; SSE3-NEXT: movzbl -22(%rsp), %esi +; SSE3-NEXT: movzbl -23(%rsp), %ecx +; SSE3-NEXT: bsrl %ecx, %ecx +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm1 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE3-NEXT: bsrl %ebx, %ecx +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm0 +; SSE3-NEXT: bsrl %edx, %ecx +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm3 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE3-NEXT: bsrl %r11d, %ecx +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm0 +; SSE3-NEXT: bsrl %esi, %ecx +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm2 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE3-NEXT: bsrl %r9d, %ecx +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm0 +; SSE3-NEXT: bsrl %r10d, %ecx +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm3 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE3-NEXT: bsrl %r8d, %ecx +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm4 +; SSE3-NEXT: movzbl -24(%rsp), %ecx +; SSE3-NEXT: bsrl %ecx, %ecx +; SSE3-NEXT: cmovel %eax, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm0 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE3-NEXT: popq %rbx +; SSE3-NEXT: popq %rbp +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv16i8: +; SSSE3: # BB#0: +; SSSE3: pushq %rbp +; SSSE3: movaps %xmm0, -24(%rsp) +; SSSE3-NEXT: movzbl -9(%rsp), %eax +; SSSE3-NEXT: bsrl %eax, %ecx +; SSSE3-NEXT: movl $15, %eax +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movzbl -10(%rsp), %ebx +; SSSE3-NEXT: movzbl -11(%rsp), %edi +; SSSE3-NEXT: movzbl -12(%rsp), %r9d +; SSSE3-NEXT: movzbl -13(%rsp), %edx +; SSSE3-NEXT: movzbl -14(%rsp), %r11d +; SSSE3-NEXT: movzbl -15(%rsp), %esi +; SSSE3-NEXT: movzbl -16(%rsp), %r8d +; SSSE3-NEXT: movzbl -17(%rsp), %ecx +; SSSE3-NEXT: bsrl %ecx, %ecx +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: bsrl %edx, %ecx +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: movzbl -18(%rsp), %edx +; SSSE3-NEXT: movzbl -19(%rsp), %ecx +; SSSE3-NEXT: movzbl -20(%rsp), %r10d +; SSSE3-NEXT: movzbl -21(%rsp), %ebp +; SSSE3-NEXT: bsrl %ebp, %ebp +; SSSE3-NEXT: cmovel %eax, %ebp +; SSSE3-NEXT: xorl $7, %ebp +; SSSE3-NEXT: movd %ebp, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: bsrl %edi, %edi +; SSSE3-NEXT: cmovel %eax, %edi +; SSSE3-NEXT: xorl $7, %edi +; SSSE3-NEXT: movd %edi, %xmm1 +; SSSE3-NEXT: bsrl %ecx, %ecx +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: bsrl %esi, %ecx +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: movzbl -22(%rsp), %esi +; SSSE3-NEXT: movzbl -23(%rsp), %ecx +; SSSE3-NEXT: bsrl %ecx, %ecx +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: bsrl %ebx, %ecx +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: bsrl %edx, %ecx +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: bsrl %r11d, %ecx +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: bsrl %esi, %ecx +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSSE3-NEXT: bsrl %r9d, %ecx +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: bsrl %r10d, %ecx +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: bsrl %r8d, %ecx +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm4 +; SSSE3-NEXT: movzbl -24(%rsp), %ecx +; SSSE3-NEXT: bsrl %ecx, %ecx +; SSSE3-NEXT: cmovel %eax, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv16i8: +; SSE41: # BB#0: +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %ecx +; SSE41-NEXT: 
movl $15, %eax +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pextrb $0, %xmm0, %edx +; SSE41-NEXT: bsrl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: xorl $7, %edx +; SSE41-NEXT: movd %edx, %xmm1 +; SSE41-NEXT: pinsrb $1, %ecx, %xmm1 +; SSE41-NEXT: pextrb $2, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $2, %ecx, %xmm1 +; SSE41-NEXT: pextrb $3, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $3, %ecx, %xmm1 +; SSE41-NEXT: pextrb $4, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $4, %ecx, %xmm1 +; SSE41-NEXT: pextrb $5, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $5, %ecx, %xmm1 +; SSE41-NEXT: pextrb $6, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $6, %ecx, %xmm1 +; SSE41-NEXT: pextrb $7, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $7, %ecx, %xmm1 +; SSE41-NEXT: pextrb $8, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $8, %ecx, %xmm1 +; SSE41-NEXT: pextrb $9, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $9, %ecx, %xmm1 +; SSE41-NEXT: pextrb $10, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $10, %ecx, %xmm1 +; SSE41-NEXT: pextrb $11, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $11, %ecx, %xmm1 +; SSE41-NEXT: pextrb $12, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $12, %ecx, %xmm1 +; SSE41-NEXT: pextrb $13, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $13, %ecx, %xmm1 +; SSE41-NEXT: pextrb $14, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $14, %ecx, %xmm1 +; SSE41-NEXT: pextrb $15, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: cmovel %eax, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: pinsrb $15, %ecx, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv16i8: +; AVX: # BB#0: +; AVX-NEXT: vpextrb $1, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %ecx +; AVX-NEXT: movl $15, %eax +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpextrb $0, %xmm0, %edx +; AVX-NEXT: bsrl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: xorl $7, %edx +; AVX-NEXT: vmovd %edx, %xmm1 +; AVX-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $2, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $3, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $4, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: 
xorl $7, %ecx +; AVX-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $5, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $6, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $7, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $8, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $9, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $10, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $11, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $12, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $13, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $14, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $15, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: cmovel %eax, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm0 +; AVX-NEXT: retq + %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 0) + ret <16 x i8> %out +} + +define <16 x i8> @testv16i8u(<16 x i8> %in) { +; SSE2-LABEL: testv16i8u: +; SSE2: # BB#0: +; SSE2: pushq %rbx +; SSE2: movaps %xmm0, -16(%rsp) +; SSE2-NEXT: movzbl -1(%rsp), %eax +; SSE2-NEXT: bsrl %eax, %eax +; SSE2-NEXT: xorl $7, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl -2(%rsp), %edi +; SSE2-NEXT: movzbl -3(%rsp), %edx +; SSE2-NEXT: movzbl -4(%rsp), %r9d +; SSE2-NEXT: movzbl -5(%rsp), %eax +; SSE2-NEXT: movzbl -6(%rsp), %r10d +; SSE2-NEXT: movzbl -7(%rsp), %ecx +; SSE2-NEXT: movzbl -8(%rsp), %r8d +; SSE2-NEXT: movzbl -9(%rsp), %esi +; SSE2-NEXT: bsrl %esi, %esi +; SSE2-NEXT: xorl $7, %esi +; SSE2-NEXT: movd %esi, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: bsrl %eax, %eax +; SSE2-NEXT: xorl $7, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl -10(%rsp), %eax +; SSE2-NEXT: movzbl -11(%rsp), %esi +; SSE2-NEXT: movzbl -12(%rsp), %r11d +; SSE2-NEXT: movzbl -13(%rsp), %ebx +; SSE2-NEXT: bsrl %ebx, %ebx +; SSE2-NEXT: xorl $7, %ebx +; SSE2-NEXT: movd %ebx, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: bsrl %edx, %edx +; SSE2-NEXT: xorl $7, 
%edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: bsrl %esi, %edx +; SSE2-NEXT: xorl $7, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: bsrl %ecx, %ecx +; SSE2-NEXT: xorl $7, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movzbl -14(%rsp), %ecx +; SSE2-NEXT: movzbl -15(%rsp), %edx +; SSE2-NEXT: bsrl %edx, %edx +; SSE2-NEXT: xorl $7, %edx +; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: bsrl %edi, %edx +; SSE2-NEXT: xorl $7, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: bsrl %eax, %eax +; SSE2-NEXT: xorl $7, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: bsrl %r10d, %eax +; SSE2-NEXT: xorl $7, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: bsrl %ecx, %eax +; SSE2-NEXT: xorl $7, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: bsrl %r9d, %eax +; SSE2-NEXT: xorl $7, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: bsrl %r11d, %eax +; SSE2-NEXT: xorl $7, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: bsrl %r8d, %eax +; SSE2-NEXT: xorl $7, %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl -16(%rsp), %eax +; SSE2-NEXT: bsrl %eax, %eax +; SSE2-NEXT: xorl $7, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv16i8u: +; SSE3: # BB#0: +; SSE3: pushq %rbx +; SSE3: movaps %xmm0, -16(%rsp) +; SSE3-NEXT: movzbl -1(%rsp), %eax +; SSE3-NEXT: bsrl %eax, %eax +; SSE3-NEXT: xorl $7, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: movzbl -2(%rsp), %edi +; SSE3-NEXT: movzbl -3(%rsp), %edx +; SSE3-NEXT: movzbl -4(%rsp), %r9d +; SSE3-NEXT: movzbl -5(%rsp), 
%eax +; SSE3-NEXT: movzbl -6(%rsp), %r10d +; SSE3-NEXT: movzbl -7(%rsp), %ecx +; SSE3-NEXT: movzbl -8(%rsp), %r8d +; SSE3-NEXT: movzbl -9(%rsp), %esi +; SSE3-NEXT: bsrl %esi, %esi +; SSE3-NEXT: xorl $7, %esi +; SSE3-NEXT: movd %esi, %xmm1 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE3-NEXT: bsrl %eax, %eax +; SSE3-NEXT: xorl $7, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: movzbl -10(%rsp), %eax +; SSE3-NEXT: movzbl -11(%rsp), %esi +; SSE3-NEXT: movzbl -12(%rsp), %r11d +; SSE3-NEXT: movzbl -13(%rsp), %ebx +; SSE3-NEXT: bsrl %ebx, %ebx +; SSE3-NEXT: xorl $7, %ebx +; SSE3-NEXT: movd %ebx, %xmm2 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE3-NEXT: bsrl %edx, %edx +; SSE3-NEXT: xorl $7, %edx +; SSE3-NEXT: movd %edx, %xmm0 +; SSE3-NEXT: bsrl %esi, %edx +; SSE3-NEXT: xorl $7, %edx +; SSE3-NEXT: movd %edx, %xmm3 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE3-NEXT: bsrl %ecx, %ecx +; SSE3-NEXT: xorl $7, %ecx +; SSE3-NEXT: movd %ecx, %xmm0 +; SSE3-NEXT: movzbl -14(%rsp), %ecx +; SSE3-NEXT: movzbl -15(%rsp), %edx +; SSE3-NEXT: bsrl %edx, %edx +; SSE3-NEXT: xorl $7, %edx +; SSE3-NEXT: movd %edx, %xmm1 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE3-NEXT: bsrl %edi, %edx +; SSE3-NEXT: xorl $7, %edx +; SSE3-NEXT: movd %edx, %xmm0 +; SSE3-NEXT: bsrl %eax, %eax +; SSE3-NEXT: xorl $7, %eax +; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE3-NEXT: bsrl %r10d, %eax +; SSE3-NEXT: xorl $7, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: bsrl %ecx, %eax +; SSE3-NEXT: xorl $7, %eax +; SSE3-NEXT: movd %eax, %xmm3 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE3-NEXT: bsrl %r9d, %eax +; SSE3-NEXT: xorl $7, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: bsrl %r11d, %eax +; SSE3-NEXT: xorl $7, %eax +; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE3-NEXT: bsrl %r8d, %eax +; SSE3-NEXT: xorl $7, %eax +; SSE3-NEXT: movd %eax, %xmm4 +; SSE3-NEXT: movzbl -16(%rsp), %eax +; SSE3-NEXT: bsrl %eax, %eax +; SSE3-NEXT: xorl 
$7, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE3-NEXT: popq %rbx +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv16i8u: +; SSSE3: # BB#0: +; SSSE3: pushq %rbx +; SSSE3: movaps %xmm0, -16(%rsp) +; SSSE3-NEXT: movzbl -1(%rsp), %eax +; SSSE3-NEXT: bsrl %eax, %eax +; SSSE3-NEXT: xorl $7, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl -2(%rsp), %edi +; SSSE3-NEXT: movzbl -3(%rsp), %edx +; SSSE3-NEXT: movzbl -4(%rsp), %r9d +; SSSE3-NEXT: movzbl -5(%rsp), %eax +; SSSE3-NEXT: movzbl -6(%rsp), %r10d +; SSSE3-NEXT: movzbl -7(%rsp), %ecx +; SSSE3-NEXT: movzbl -8(%rsp), %r8d +; SSSE3-NEXT: movzbl -9(%rsp), %esi +; SSSE3-NEXT: bsrl %esi, %esi +; SSSE3-NEXT: xorl $7, %esi +; SSSE3-NEXT: movd %esi, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: bsrl %eax, %eax +; SSSE3-NEXT: xorl $7, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl -10(%rsp), %eax +; SSSE3-NEXT: movzbl -11(%rsp), %esi +; SSSE3-NEXT: movzbl -12(%rsp), %r11d +; SSSE3-NEXT: movzbl -13(%rsp), %ebx +; SSSE3-NEXT: bsrl %ebx, %ebx +; SSSE3-NEXT: xorl $7, %ebx +; SSSE3-NEXT: movd %ebx, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: bsrl %edx, %edx +; SSSE3-NEXT: xorl $7, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: bsrl %esi, %edx +; SSSE3-NEXT: xorl $7, %edx +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: bsrl %ecx, %ecx +; SSSE3-NEXT: xorl $7, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movzbl -14(%rsp), %ecx +; SSSE3-NEXT: movzbl -15(%rsp), %edx +; SSSE3-NEXT: bsrl %edx, %edx +; SSSE3-NEXT: xorl $7, %edx +; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: bsrl %edi, %edx +; SSSE3-NEXT: xorl $7, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: bsrl %eax, %eax +; SSSE3-NEXT: xorl $7, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw 
{{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: bsrl %r10d, %eax +; SSSE3-NEXT: xorl $7, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: bsrl %ecx, %eax +; SSSE3-NEXT: xorl $7, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: bsrl %r9d, %eax +; SSSE3-NEXT: xorl $7, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: bsrl %r11d, %eax +; SSSE3-NEXT: xorl $7, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: bsrl %r8d, %eax +; SSSE3-NEXT: xorl $7, %eax +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl -16(%rsp), %eax +; SSSE3-NEXT: bsrl %eax, %eax +; SSSE3-NEXT: xorl $7, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv16i8u: +; SSE41: # BB#0: +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pextrb $0, %xmm0, %ecx +; SSE41-NEXT: bsrl %ecx, %ecx +; SSE41-NEXT: xorl $7, %ecx +; SSE41-NEXT: movd %ecx, %xmm1 +; SSE41-NEXT: pinsrb $1, %eax, %xmm1 +; SSE41-NEXT: pextrb $2, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm1 +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm1 +; SSE41-NEXT: pextrb $4, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm1 +; SSE41-NEXT: pextrb $5, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm1 +; SSE41-NEXT: pextrb $6, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm1 +; SSE41-NEXT: pextrb $7, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm1 +; SSE41-NEXT: pextrb $8, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm1 +; SSE41-NEXT: pextrb $9, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm1 +; SSE41-NEXT: pextrb $10, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm1 +; SSE41-NEXT: pextrb $11, %xmm0, %eax +; 
SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $11, %eax, %xmm1 +; SSE41-NEXT: pextrb $12, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm1 +; SSE41-NEXT: pextrb $13, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $13, %eax, %xmm1 +; SSE41-NEXT: pextrb $14, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm1 +; SSE41-NEXT: pextrb $15, %xmm0, %eax +; SSE41-NEXT: bsrl %eax, %eax +; SSE41-NEXT: xorl $7, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv16i8u: +; AVX: # BB#0: +; AVX-NEXT: vpextrb $1, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpextrb $0, %xmm0, %ecx +; AVX-NEXT: bsrl %ecx, %ecx +; AVX-NEXT: xorl $7, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $2, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $3, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $4, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $5, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $6, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $7, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $8, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $9, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $10, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $11, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $12, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $13, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $14, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $15, %xmm0, %eax +; AVX-NEXT: bsrl %eax, %eax +; AVX-NEXT: xorl $7, %eax +; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %in, i1 -1) + ret <16 x i8> %out +} + +define <2 x i64> @foldv2i64() { +; SSE-LABEL: foldv2i64: +; SSE: # BB#0: +; SSE-NEXT: movl $55, %eax +; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: foldv2i64: +; AVX: # BB#0: +; AVX-NEXT: movl $55, %eax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: retq + %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0) + ret <2 x i64> %out +} + +define <2 x i64> @foldv2i64u() { +; SSE-LABEL: foldv2i64u: +; SSE: # BB#0: +; SSE-NEXT: movl $55, %eax +; SSE-NEXT: movd %rax, %xmm0 
+; SSE-NEXT: retq +; +; AVX-LABEL: foldv2i64u: +; AVX: # BB#0: +; AVX-NEXT: movl $55, %eax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: retq + %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1) + ret <2 x i64> %out +} + +define <4 x i32> @foldv4i32() { +; SSE-LABEL: foldv4i32: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv4i32: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX-NEXT: retq + %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0) + ret <4 x i32> %out +} + +define <4 x i32> @foldv4i32u() { +; SSE-LABEL: foldv4i32u: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv4i32u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX-NEXT: retq + %out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1) + ret <4 x i32> %out +} + +define <8 x i16> @foldv8i16() { +; SSE-LABEL: foldv8i16: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv8i16: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX-NEXT: retq + %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0) + ret <8 x i16> %out +} + +define <8 x i16> @foldv8i16u() { +; SSE-LABEL: foldv8i16u: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv8i16u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX-NEXT: retq + %out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1) + ret <8 x i16> %out +} + +define <16 x i8> @foldv16i8() { +; SSE-LABEL: foldv16i8: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX-NEXT: retq + %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0) + ret <16 x i8> %out +} + +define <16 x i8> @foldv16i8u() { +; SSE-LABEL: foldv16i8u: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv16i8u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX-NEXT: retq + %out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1) + ret <16 x i8> %out +} + +declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) +declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) +declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) diff --git a/test/CodeGen/X86/vector-lzcnt-256.ll b/test/CodeGen/X86/vector-lzcnt-256.ll new file mode 100644 index 000000000000..48abe1290528 --- /dev/null +++ b/test/CodeGen/X86/vector-lzcnt-256.ll @@ -0,0 +1,1305 @@ +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +target triple = "x86_64-unknown-unknown" + +define <4 x i64> 
@testv4i64(<4 x i64> %in) { +; AVX1-LABEL: testv4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: bsrq %rax, %rax +; AVX1-NEXT: movl $127, %ecx +; AVX1-NEXT: cmoveq %rcx, %rax +; AVX1-NEXT: xorq $63, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: bsrq %rax, %rax +; AVX1-NEXT: cmoveq %rcx, %rax +; AVX1-NEXT: xorq $63, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: bsrq %rax, %rax +; AVX1-NEXT: cmoveq %rcx, %rax +; AVX1-NEXT: xorq $63, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: bsrq %rax, %rax +; AVX1-NEXT: cmoveq %rcx, %rax +; AVX1-NEXT: xorq $63, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: bsrq %rax, %rax +; AVX2-NEXT: movl $127, %ecx +; AVX2-NEXT: cmoveq %rcx, %rax +; AVX2-NEXT: xorq $63, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: bsrq %rax, %rax +; AVX2-NEXT: cmoveq %rcx, %rax +; AVX2-NEXT: xorq $63, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: bsrq %rax, %rax +; AVX2-NEXT: cmoveq %rcx, %rax +; AVX2-NEXT: xorq $63, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: bsrq %rax, %rax +; AVX2-NEXT: cmoveq %rcx, %rax +; AVX2-NEXT: xorq $63, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 0) + ret <4 x i64> %out +} + +define <4 x i64> @testv4i64u(<4 x i64> %in) { +; AVX1-LABEL: testv4i64u: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: bsrq %rax, %rax +; AVX1-NEXT: xorq $63, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: bsrq %rax, %rax +; AVX1-NEXT: xorq $63, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: bsrq %rax, %rax +; AVX1-NEXT: xorq $63, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: bsrq %rax, %rax +; AVX1-NEXT: xorq $63, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv4i64u: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: bsrq %rax, %rax +; AVX2-NEXT: xorq $63, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: bsrq %rax, %rax +; AVX2-NEXT: xorq $63, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: bsrq %rax, %rax +; AVX2-NEXT: xorq $63, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: bsrq %rax, %rax +; AVX2-NEXT: xorq $63, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, 
%xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %in, i1 -1) + ret <4 x i64> %out +} + +define <8 x i32> @testv8i32(<8 x i32> %in) { +; AVX1-LABEL: testv8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %ecx +; AVX1-NEXT: movl $63, %eax +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $31, %ecx +; AVX1-NEXT: vmovd %xmm1, %edx +; AVX1-NEXT: bsrl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: xorl $31, %edx +; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $31, %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $31, %ecx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $31, %ecx +; AVX1-NEXT: vmovd %xmm0, %edx +; AVX1-NEXT: bsrl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: xorl $31, %edx +; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $31, %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $31, %ecx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrd $1, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %ecx +; AVX2-NEXT: movl $63, %eax +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $31, %ecx +; AVX2-NEXT: vmovd %xmm1, %edx +; AVX2-NEXT: bsrl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: xorl $31, %edx +; AVX2-NEXT: vmovd %edx, %xmm2 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $31, %ecx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $3, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $31, %ecx +; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1 +; AVX2-NEXT: vpextrd $1, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $31, %ecx +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: bsrl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: xorl $31, %edx +; AVX2-NEXT: vmovd %edx, %xmm2 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $2, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $31, %ecx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $31, %ecx +; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 0) + ret <8 x i32> %out +} + +define <8 x i32> @testv8i32u(<8 x i32> %in) { +; AVX1-LABEL: testv8i32u: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; 
AVX1-NEXT: xorl $31, %eax +; AVX1-NEXT: vmovd %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: xorl $31, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $31, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $31, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $31, %eax +; AVX1-NEXT: vmovd %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: xorl $31, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $31, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $31, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i32u: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrd $1, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $31, %eax +; AVX2-NEXT: vmovd %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: xorl $31, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $2, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $31, %eax +; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $3, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $31, %eax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 +; AVX2-NEXT: vpextrd $1, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $31, %eax +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: xorl $31, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $2, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $31, %eax +; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $31, %eax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %in, i1 -1) + ret <8 x i32> %out +} + +define <16 x i16> @testv16i16(<16 x i16> %in) { +; AVX1-LABEL: testv16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrw $1, %xmm1, %eax +; AVX1-NEXT: bsrw %ax, %cx +; AVX1-NEXT: movw $31, %ax +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vmovd %xmm1, %edx +; AVX1-NEXT: bsrw %dx, %dx +; AVX1-NEXT: cmovew %ax, %dx +; AVX1-NEXT: xorl $15, %edx +; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $2, %xmm1, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $3, %xmm1, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $4, %xmm1, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $5, %xmm1, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl 
$15, %ecx +; AVX1-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $6, %xmm1, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $7, %xmm1, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1 +; AVX1-NEXT: vpextrw $1, %xmm0, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vmovd %xmm0, %edx +; AVX1-NEXT: bsrw %dx, %dx +; AVX1-NEXT: cmovew %ax, %dx +; AVX1-NEXT: xorl $15, %edx +; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $2, %xmm0, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $3, %xmm0, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $4, %xmm0, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $5, %xmm0, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $6, %xmm0, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $7, %xmm0, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrw $1, %xmm1, %eax +; AVX2-NEXT: bsrw %ax, %cx +; AVX2-NEXT: movw $31, %ax +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vmovd %xmm1, %edx +; AVX2-NEXT: bsrw %dx, %dx +; AVX2-NEXT: cmovew %ax, %dx +; AVX2-NEXT: xorl $15, %edx +; AVX2-NEXT: vmovd %edx, %xmm2 +; AVX2-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $2, %xmm1, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $3, %xmm1, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $4, %xmm1, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $5, %xmm1, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $6, %xmm1, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $7, %xmm1, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1 +; AVX2-NEXT: vpextrw $1, %xmm0, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: bsrw %dx, %dx +; AVX2-NEXT: cmovew %ax, %dx +; AVX2-NEXT: xorl $15, %edx +; AVX2-NEXT: vmovd %edx, %xmm2 +; AVX2-NEXT: 
vpinsrw $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $2, %xmm0, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $3, %xmm0, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $4, %xmm0, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $5, %xmm0, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $6, %xmm0, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $7, %xmm0, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 0) + ret <16 x i16> %out +} + +define <16 x i16> @testv16i16u(<16 x i16> %in) { +; AVX1-LABEL: testv16i16u: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrw $1, %xmm1, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vmovd %xmm1, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $2, %xmm1, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $3, %xmm1, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $4, %xmm1, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $5, %xmm1, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $6, %xmm1, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $7, %xmm1, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 +; AVX1-NEXT: vpextrw $1, %xmm0, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vmovd %xmm0, %ecx +; AVX1-NEXT: bsrw %cx, %cx +; AVX1-NEXT: xorl $15, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $2, %xmm0, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $3, %xmm0, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $4, %xmm0, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $5, %xmm0, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $6, %xmm0, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $7, %xmm0, %eax +; AVX1-NEXT: bsrw %ax, %ax +; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i16u: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrw $1, %xmm1, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vmovd %xmm1, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $2, %xmm1, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $3, %xmm1, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $4, %xmm1, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $5, %xmm1, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $6, %xmm1, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $7, %xmm1, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 +; AVX2-NEXT: vpextrw $1, %xmm0, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: bsrw %cx, %cx +; AVX2-NEXT: xorl $15, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $2, %xmm0, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $3, %xmm0, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $4, %xmm0, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $5, %xmm0, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $6, %xmm0, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $7, %xmm0, %eax +; AVX2-NEXT: bsrw %ax, %ax +; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %in, i1 -1) + ret <16 x i16> %out +} + +define <32 x i8> @testv32i8(<32 x i8> %in) { +; AVX1-LABEL: testv32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrb $1, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %ecx +; AVX1-NEXT: movl $15, %eax +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpextrb $0, %xmm1, %edx +; AVX1-NEXT: bsrl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: xorl $7, %edx +; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $2, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $3, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $4, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: 
vpextrb $5, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $6, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $7, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $8, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $9, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $10, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $11, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $12, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $13, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $14, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $15, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1 +; AVX1-NEXT: vpextrb $1, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpextrb $0, %xmm0, %edx +; AVX1-NEXT: bsrl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: xorl $7, %edx +; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $2, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $3, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $4, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $5, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $6, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $7, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $8, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $9, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; 
AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $10, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $11, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $12, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $13, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $14, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $15, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrb $1, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %ecx +; AVX2-NEXT: movl $15, %eax +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpextrb $0, %xmm1, %edx +; AVX2-NEXT: bsrl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: xorl $7, %edx +; AVX2-NEXT: vmovd %edx, %xmm2 +; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $2, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $3, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $4, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $5, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $6, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $7, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $8, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $9, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $10, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $11, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $12, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; 
AVX2-NEXT: vpextrb $13, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $14, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $15, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm1 +; AVX2-NEXT: vpextrb $1, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpextrb $0, %xmm0, %edx +; AVX2-NEXT: bsrl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: xorl $7, %edx +; AVX2-NEXT: vmovd %edx, %xmm2 +; AVX2-NEXT: vpinsrb $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $2, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $3, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $4, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $4, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $5, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $5, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $6, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $7, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $8, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $9, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $10, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $11, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $12, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $13, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $14, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $15, %xmm0, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vpinsrb $15, %ecx, %xmm2, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 0) + ret <32 x i8> %out +} + +define <32 x i8> @testv32i8u(<32 x i8> %in) { +; AVX1-LABEL: testv32i8u: +; AVX1: # BB#0: +; AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrb $1, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpextrb $0, %xmm1, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $2, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $3, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $4, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $5, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $6, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $7, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $9, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $10, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $11, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $12, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $13, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $14, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $15, %xmm1, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpextrb $0, %xmm0, %ecx +; AVX1-NEXT: bsrl %ecx, %ecx +; AVX1-NEXT: xorl $7, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX1-NEXT: 
vpextrb $9, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: bsrl %eax, %eax +; AVX1-NEXT: xorl $7, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv32i8u: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrb $1, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpextrb $0, %xmm1, %ecx +; AVX2-NEXT: bsrl %ecx, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $2, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $3, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $4, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $5, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $6, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $7, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $8, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $9, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $10, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $11, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $12, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $13, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $14, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $15, %xmm1, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpextrb $0, %xmm0, %ecx +; AVX2-NEXT: bsrl 
%ecx, %ecx +; AVX2-NEXT: xorl $7, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: bsrl %eax, %eax +; AVX2-NEXT: xorl $7, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %in, i1 -1) + ret <32 x i8> %out +} + +define <4 x i64> @foldv4i64() { +; AVX-LABEL: foldv4i64: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; AVX-NEXT: retq + %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0) + ret <4 x i64> %out +} + +define <4 x i64> @foldv4i64u() { +; AVX-LABEL: foldv4i64u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; AVX-NEXT: retq + %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1) + ret <4 x i64> %out +} + +define <8 x i32> @foldv8i32() { +; AVX-LABEL: foldv8i32: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX-NEXT: retq + %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0) + ret <8 x i32> %out +} + +define <8 x i32> @foldv8i32u() { +; AVX-LABEL: foldv8i32u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; AVX-NEXT: retq + %out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1) + ret <8 x i32> %out +} + +define <16 x i16> @foldv16i16() { +; AVX-LABEL: foldv16i16: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = 
[7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX-NEXT: retq + %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0) + ret <16 x i16> %out +} + +define <16 x i16> @foldv16i16u() { +; AVX-LABEL: foldv16i16u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX-NEXT: retq + %out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1) + ret <16 x i16> %out +} + +define <32 x i8> @foldv32i8() { +; AVX-LABEL: foldv32i8: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX-NEXT: retq + %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0) + ret <32 x i8> %out +} + +define <32 x i8> @foldv32i8u() { +; AVX-LABEL: foldv32i8u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX-NEXT: retq + %out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1) + ret <32 x i8> %out +} + +declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) +declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1) +declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>, i1) +declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>, i1) diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll new file mode 100644 index 000000000000..fef445de04ab --- /dev/null +++ b/test/CodeGen/X86/vector-popcnt-128.ll @@ -0,0 +1,462 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +target triple = "x86_64-unknown-unknown" + +define <2 x i64> @testv2i64(<2 x i64> %in) { +; SSE2-LABEL: testv2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlq $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlq $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, 
%xmm1 +; SSE2-NEXT: psrlq $4, %xmm1 +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv2i64: +; SSE3: # BB#0: +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubq %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [3689348814741910323,3689348814741910323] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlq $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddq %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlq $4, %xmm1 +; SSE3-NEXT: paddq %xmm0, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: psadbw %xmm0, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv2i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pshufb %xmm2, %xmm4 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm3 +; SSSE3-NEXT: paddb %xmm4, %xmm3 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: psadbw %xmm3, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv2i64: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm3 +; SSE41-NEXT: paddb %xmm4, %xmm3 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: psadbw %xmm3, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv2i64: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq + %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in) + ret <2 x i64> %out +} + +define <4 x i32> @testv4i32(<4 x i32> %in) { +; SSE2-LABEL: testv4i32: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrld $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $4, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: psadbw %xmm0, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: psadbw %xmm0, %xmm1 +; SSE2-NEXT: 
packuswb %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv4i32: +; SSE3: # BB#0: +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubd %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [858993459,858993459,858993459,858993459] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrld $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddd %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrld $4, %xmm1 +; SSE3-NEXT: paddd %xmm0, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: pxor %xmm0, %xmm0 +; SSE3-NEXT: movdqa %xmm1, %xmm2 +; SSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE3-NEXT: psadbw %xmm0, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE3-NEXT: psadbw %xmm0, %xmm1 +; SSE3-NEXT: packuswb %xmm2, %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv4i32: +; SSSE3: # BB#0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 +; SSSE3-NEXT: paddb %xmm4, %xmm1 +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: movdqa %xmm1, %xmm2 +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: psadbw %xmm0, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: psadbw %xmm0, %xmm1 +; SSSE3-NEXT: packuswb %xmm2, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv4i32: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pshufb %xmm3, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: paddb %xmm4, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE41-NEXT: psadbw %xmm0, %xmm2 +; SSE41-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE41-NEXT: psadbw %xmm0, %xmm1 +; SSE41-NEXT: packuswb %xmm2, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv4i32: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpsadbw %xmm2, %xmm1, %xmm2 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vpsadbw %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in) + ret <4 x i32> %out 
+} + +define <8 x i16> @testv8i16(<8 x i16> %in) { +; SSE2-LABEL: testv8i16: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psllw $8, %xmm0 +; SSE2-NEXT: paddb %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv8i16: +; SSE3: # BB#0: +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubw %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [13107,13107,13107,13107,13107,13107,13107,13107] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddw %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddw %xmm0, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: psllw $8, %xmm0 +; SSE3-NEXT: paddb %xmm1, %xmm0 +; SSE3-NEXT: psrlw $8, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv8i16: +; SSSE3: # BB#0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: pand %xmm1, %xmm2 +; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pshufb %xmm2, %xmm4 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm1, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm3 +; SSSE3-NEXT: paddb %xmm4, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, %xmm0 +; SSSE3-NEXT: psllw $8, %xmm0 +; SSSE3-NEXT: paddb %xmm3, %xmm0 +; SSSE3-NEXT: psrlw $8, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv8i16: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: pshufb %xmm2, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm3 +; SSE41-NEXT: paddb %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: psllw $8, %xmm0 +; SSE41-NEXT: paddb %xmm3, %xmm0 +; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv8i16: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX-NEXT: retq + %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in) + ret <8 x i16> %out +} + +define <16 x i8> @testv16i8(<16 x i8> %in) { +; SSE2-LABEL: testv16i8: +; SSE2: # BB#0: +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $1, %xmm1 +; SSE2-NEXT: 
pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: psrlw $2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrlw $4, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv16i8: +; SSE3: # BB#0: +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $1, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: psubb %xmm1, %xmm0 +; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; SSE3-NEXT: movdqa %xmm0, %xmm2 +; SSE3-NEXT: pand %xmm1, %xmm2 +; SSE3-NEXT: psrlw $2, %xmm0 +; SSE3-NEXT: pand %xmm1, %xmm0 +; SSE3-NEXT: paddb %xmm2, %xmm0 +; SSE3-NEXT: movdqa %xmm0, %xmm1 +; SSE3-NEXT: psrlw $4, %xmm1 +; SSE3-NEXT: paddb %xmm0, %xmm1 +; SSE3-NEXT: pand {{.*}}(%rip), %xmm1 +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv16i8: +; SSSE3: # BB#0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm3 +; SSSE3-NEXT: pand %xmm2, %xmm3 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSSE3-NEXT: movdqa %xmm1, %xmm4 +; SSSE3-NEXT: pshufb %xmm3, %xmm4 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: pshufb %xmm0, %xmm1 +; SSSE3-NEXT: paddb %xmm4, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv16i8: +; SSE41: # BB#0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pand %xmm2, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; SSE41-NEXT: movdqa %xmm1, %xmm4 +; SSE41-NEXT: pshufb %xmm3, %xmm4 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pshufb %xmm0, %xmm1 +; SSE41-NEXT: paddb %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: retq + %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in) + ret <16 x i8> %out +} + +define <2 x i64> @foldv2i64() { +; SSE-LABEL: foldv2i64: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,64] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv2i64: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64] +; AVX-NEXT: retq + %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> <i64 256, i64 -1>) + ret <2 x i64> %out +} + +define <4 x i32> @foldv4i32() { +; SSE-LABEL: foldv4i32: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,32,0,8] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv4i32: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8] +; AVX-NEXT: retq + %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>) + ret <4 x i32> %out +} + +define <8 x i16> @foldv8i16() { +; SSE-LABEL: foldv8i16: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] +; 
SSE-NEXT: retq +; +; AVX-LABEL: foldv8i16: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3] +; AVX-NEXT: retq + %out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>) + ret <8 x i16> %out +} + +define <16 x i8> @foldv16i8() { +; SSE-LABEL: foldv16i8: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1] +; AVX-NEXT: retq + %out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>) + ret <16 x i8> %out +} + +declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) +declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) +declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) +declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll new file mode 100644 index 000000000000..7ce4f712483a --- /dev/null +++ b/test/CodeGen/X86/vector-popcnt-256.ll @@ -0,0 +1,220 @@ +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +target triple = "x86_64-unknown-unknown" + +define <4 x i64> @testv4i64(<4 x i64> %in) { +; AVX1-LABEL: testv4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq + %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in) + ret <4 x i64> %out +} + +define <8 x i32> @testv8i32(<8 x i32> %in) { +; AVX1-LABEL: testv8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, 
%xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm5, %xmm3, %xmm5 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-NEXT: vpsadbw %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; AVX2-NEXT: vpsadbw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in) + ret <8 x i32> %out +} + +define <16 x i16> @testv16i16(<16 x i16> %in) { +; AVX1-LABEL: testv16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 +; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand 
%ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 +; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in) + ret <16 x i16> %out +} + +define <32 x i8> @testv32i8(<32 x i8> %in) { +; AVX1-LABEL: testv32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in) + ret <32 x i8> %out +} + +define <4 x i64> @foldv4i64() { +; AVX-LABEL: foldv4i64: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8] +; AVX-NEXT: retq + %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>) + ret <4 x i64> %out +} + +define <8 x i32> @foldv8i32() { +; AVX-LABEL: foldv8i32: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3] +; AVX-NEXT: retq + %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>) + ret <8 x i32> %out +} + +define <16 x i16> @foldv16i16() { +; AVX-LABEL: foldv16i16: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,16,0,8,0,3,2,3,15,7,1,1,1,1,1,1] +; AVX-NEXT: retq + %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>) + ret <16 x i16> %out +} + +define <32 x i8> @foldv32i8() { +; AVX-LABEL: foldv32i8: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1,1,1,0,0,1,2,3,4,5,6,7,8,2,2,3,7] +; AVX-NEXT: retq + %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>) + ret <32 x i8> %out +} + +declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) +declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) +declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) +declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>) diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index 
1b42a637907c..944ec4b8d3ac 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -843,7 +843,6 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) { define <4 x double> @insert_reg_and_zero_v4f64(double %a) { ; ALL-LABEL: insert_reg_and_zero_v4f64: ; ALL: # BB#0: -; ALL-NEXT: # kill: XMM0<def> XMM0<kill> YMM0<def> ; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 ; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] ; ALL-NEXT: retq diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll index 62d4af7809b6..8dc76231856a 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -15,9 +15,8 @@ define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00000010: ; ALL: # BB#0: -; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> ret <8 x double> %shuffle @@ -26,9 +25,8 @@ define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00000200: ; ALL: # BB#0: -; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> ret <8 x double> %shuffle @@ -37,9 +35,8 @@ define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00003000: ; ALL: # BB#0: -; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0> ret <8 x double> %shuffle @@ -48,11 +45,8 @@ define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00040000: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 -; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 -; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0> ret <8 x double> %shuffle @@ -61,11 +55,8 @@ define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00500000: ; ALL: # BB#0: -; ALL-NEXT: 
vextractf64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3] -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,0] -; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x double> %shuffle @@ -74,11 +65,8 @@ define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_06000000: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3] -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,0] -; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x double> %shuffle @@ -87,11 +75,11 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_70000000: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3] -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,0,0,0] -; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; ALL-NEXT: movl $7, %eax +; ALL-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2 +; ALL-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x double> %shuffle @@ -100,10 +88,7 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01014545: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vpermpd $68, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> ret <8 x double> %shuffle @@ -112,9 +97,8 @@ define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00112233: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1] -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3> ret <8 x double> %shuffle @@ -123,9 +107,8 @@ define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00001111: ; 
ALL: # BB#0: -; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> ret <8 x double> %shuffle @@ -134,11 +117,7 @@ define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_81a3c5e7: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 -; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3] -; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vshufpd $170, %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7> ret <8 x double> %shuffle @@ -147,10 +126,9 @@ define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_08080808: ; ALL: # BB#0: -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8> ret <8 x double> %shuffle @@ -159,15 +137,9 @@ define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_08084c4c: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 -; ALL-NEXT: vbroadcastsd %xmm3, %ymm3 -; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12> ret <8 x double> %shuffle @@ -176,13 +148,9 @@ define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_8823cc67: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 -; ALL-NEXT: vbroadcastsd %xmm3, %ymm3 -; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3] -; ALL-NEXT: vbroadcastsd %xmm1, %ymm1 -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle 
= shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7> ret <8 x double> %shuffle @@ -191,13 +159,9 @@ define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_9832dc76: ; ALL: # BB#0: -; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3] -; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1 -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6> ret <8 x double> %shuffle @@ -206,13 +170,9 @@ define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_9810dc54: ; ALL: # BB#0: -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 -; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2] -; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1 -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4> ret <8 x double> %shuffle @@ -221,15 +181,9 @@ define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_08194c5d: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3 -; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] -; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> ret <8 x double> %shuffle @@ -238,15 +192,9 @@ define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_2a3b6e7f: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3 -; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3] -; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> ret <8 x double> %shuffle @@ -255,13 +203,9 @@ define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_08192a3b: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,2,2,3] -; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3] -; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> ret <8 x double> %shuffle @@ -270,11 +214,9 @@ define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_08991abb: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,1,1] -; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3] -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3] -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,3] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11> ret <8 x double> %shuffle @@ -283,12 +225,9 @@ define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_091b2d3f: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3] -; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15> ret <8 x double> %shuffle @@ -297,11 +236,9 @@ define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_09ab1def: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,2,2] -; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3] -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] -; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2 +; ALL-NEXT: 
vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15> ret <8 x double> %shuffle @@ -310,10 +247,7 @@ define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00014445: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd $64, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5> ret <8 x double> %shuffle @@ -322,10 +256,7 @@ define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00204464: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd $32, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4> ret <8 x double> %shuffle @@ -334,10 +265,7 @@ define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_03004744: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd $12, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4> ret <8 x double> %shuffle @@ -346,10 +274,7 @@ define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10005444: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd $1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4> ret <8 x double> %shuffle @@ -358,10 +283,7 @@ define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_22006644: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd $10, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4> ret <8 x double> %shuffle @@ -370,10 +292,7 @@ define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> 
%b) { ; ALL-LABEL: shuffle_v8f64_33307774: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd $63, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4> ret <8 x double> %shuffle @@ -382,10 +301,7 @@ define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_32107654: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermpd $27, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> ret <8 x double> %shuffle @@ -394,10 +310,7 @@ define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00234467: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $136, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7> ret <8 x double> %shuffle @@ -406,10 +319,7 @@ define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00224466: ; ALL: # BB#0: -; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $0, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> ret <8 x double> %shuffle @@ -418,10 +328,7 @@ define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10325476: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $85, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> ret <8 x double> %shuffle @@ -430,10 +337,7 @@ define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_11335577: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $255, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x 
double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> ret <8 x double> %shuffle @@ -442,10 +346,7 @@ define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10235467: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,3] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $153, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> ret <8 x double> %shuffle @@ -454,10 +355,7 @@ define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10225466: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,2] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $17, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6> ret <8 x double> %shuffle @@ -466,10 +364,8 @@ define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00015444: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4> ret <8 x double> %shuffle @@ -478,10 +374,8 @@ define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00204644: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4> ret <8 x double> %shuffle @@ -490,10 +384,8 @@ define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_03004474: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4> ret <8 x double> %shuffle @@ -502,10 +394,8 @@ define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) 
{ define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10004444: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> ret <8 x double> %shuffle @@ -514,10 +404,8 @@ define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_22006446: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,2] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6> ret <8 x double> %shuffle @@ -526,10 +414,8 @@ define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_33307474: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,3,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4> ret <8 x double> %shuffle @@ -538,9 +424,8 @@ define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_32104567: ; ALL: # BB#0: -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> ret <8 x double> %shuffle @@ -549,10 +434,8 @@ define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00236744: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4> ret <8 x double> %shuffle @@ -561,10 +444,8 @@ define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00226644: ; ALL: # BB#0: -; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2] -; ALL-NEXT: 
vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4> ret <8 x double> %shuffle @@ -573,9 +454,7 @@ define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_10324567: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $165, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7> ret <8 x double> %shuffle @@ -584,9 +463,7 @@ define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_11334567: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $175, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7> ret <8 x double> %shuffle @@ -595,9 +472,7 @@ define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01235467: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,3] -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd $154, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> ret <8 x double> %shuffle @@ -606,9 +481,7 @@ define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_01235466: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2] -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd $26, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6> ret <8 x double> %shuffle @@ -617,10 +490,8 @@ define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_002u6u44: ; ALL: # BB#0: -; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,0] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4> ret <8 x double> %shuffle @@ -629,10 +500,8 @@ define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) { define <8 x 
double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_00uu66uu: ; ALL: # BB#0: -; ALL-NEXT: vbroadcastsd %xmm0, %ymm1 -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef> ret <8 x double> %shuffle @@ -641,9 +510,7 @@ define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_103245uu: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $37, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef> ret <8 x double> %shuffle @@ -652,9 +519,7 @@ define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_1133uu67: ; ALL: # BB#0: -; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3] -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $143, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7> ret <8 x double> %shuffle @@ -663,9 +528,7 @@ define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_0uu354uu: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2] -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd $24, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef> ret <8 x double> %shuffle @@ -674,9 +537,7 @@ define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_uuu3uu66: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd $8, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6> ret <8 x double> %shuffle @@ -685,16 +546,9 @@ define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_c348cda0: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[0,1] -; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 -; ALL-NEXT: vbroadcastsd %xmm1, %ymm4 -; ALL-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3] -; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3] -; ALL-NEXT: vblendpd {{.*#+}} 
ymm1 = ymm3[0,1],ymm1[2],ymm3[3] -; ALL-NEXT: vbroadcastsd %xmm0, %ymm0 -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3] -; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0> ret <8 x double> %shuffle @@ -703,17 +557,9 @@ define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_f511235a: ; ALL: # BB#0: -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; ALL-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3] -; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,3,1,3] -; ALL-NEXT: vmovddup {{.*#+}} ymm4 = ymm1[0,0,2,2] -; ALL-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3] -; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1] -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3] -; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1 -; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3] -; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] -; ALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10> ret <8 x double> %shuffle @@ -731,9 +577,8 @@ define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00000010: ; ALL: # BB#0: -; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0> ret <8 x i64> %shuffle @@ -742,9 +587,8 @@ define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00000200: ; ALL: # BB#0: -; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0> ret <8 x i64> %shuffle @@ -753,9 +597,8 @@ define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00003000: ; ALL: # BB#0: -; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0> ret <8 x i64> %shuffle @@ -764,11 +607,8 @@ define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: 
shuffle_v8i64_00040000: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 -; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 -; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0> ret <8 x i64> %shuffle @@ -777,11 +617,8 @@ define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00500000: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0] -; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i64> %shuffle @@ -790,11 +627,8 @@ define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_06000000: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,0] -; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i64> %shuffle @@ -803,11 +637,11 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_70000000: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,0,0,0] -; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; ALL-NEXT: movl $7, %eax +; ALL-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2 +; ALL-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0> ret <8 x i64> %shuffle @@ -816,10 +650,7 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_01014545: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1 -; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vpermq $68, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5> ret <8 x i64> %shuffle @@ -828,9 +659,8 @@ define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: 
shuffle_v8i64_00112233: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1] -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3> ret <8 x i64> %shuffle @@ -839,9 +669,8 @@ define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00001111: ; ALL: # BB#0: -; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1> ret <8 x i64> %shuffle @@ -850,11 +679,7 @@ define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_81a3c5e7: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vshufpd $170, %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7> ret <8 x i64> %shuffle @@ -863,10 +688,9 @@ define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_08080808: ; ALL: # BB#0: -; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 -; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8> ret <8 x i64> %shuffle @@ -875,15 +699,9 @@ define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_08084c4c: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; ALL-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2 -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; ALL-NEXT: vpbroadcastq %xmm3, %ymm3 -; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 -; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12> ret <8 x i64> %shuffle @@ -892,13 +710,9 @@ define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> 
@shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_8823cc67: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; ALL-NEXT: vpbroadcastq %xmm3, %ymm3 -; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; ALL-NEXT: vpbroadcastq %xmm1, %ymm1 -; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7> ret <8 x i64> %shuffle @@ -907,13 +721,9 @@ define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_9832dc76: ; ALL: # BB#0: -; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6> ret <8 x i64> %shuffle @@ -922,13 +732,9 @@ define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_9810dc54: ; ALL: # BB#0: -; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2 -; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4> ret <8 x i64> %shuffle @@ -937,15 +743,9 @@ define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_08194c5d: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3] -; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> ret <8 x i64> %shuffle @@ -954,15 +754,9 @@ define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) { 
define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_2a3b6e7f: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] -; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] -; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> ret <8 x i64> %shuffle @@ -971,13 +765,9 @@ define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_08192a3b: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,2,3] -; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3] -; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> ret <8 x i64> %shuffle @@ -986,11 +776,9 @@ define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_08991abb: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,0,1,1] -; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,3] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11> ret <8 x i64> %shuffle @@ -999,12 +787,9 @@ define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_091b2d3f: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3] -; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15> ret <8 x i64> %shuffle @@ -1013,11 +798,9 @@ define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> 
@shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_09ab1def: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7] -; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15> ret <8 x i64> %shuffle @@ -1026,10 +809,7 @@ define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00014445: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq $64, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5> ret <8 x i64> %shuffle @@ -1038,10 +818,7 @@ define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00204464: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq $32, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4> ret <8 x i64> %shuffle @@ -1050,10 +827,7 @@ define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_03004744: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq $12, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4> ret <8 x i64> %shuffle @@ -1062,10 +836,7 @@ define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10005444: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq $1, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4> ret <8 x i64> %shuffle @@ -1074,10 +845,7 @@ define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_22006644: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq $10, %zmm0, %zmm0 
; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4> ret <8 x i64> %shuffle @@ -1086,10 +854,7 @@ define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_33307774: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq $63, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4> ret <8 x i64> %shuffle @@ -1098,10 +863,7 @@ define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_32107654: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermq $27, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4> ret <8 x i64> %shuffle @@ -1110,10 +872,7 @@ define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00234467: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $136, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7> ret <8 x i64> %shuffle @@ -1122,10 +881,7 @@ define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00224466: ; ALL: # BB#0: -; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $0, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> ret <8 x i64> %shuffle @@ -1134,10 +890,7 @@ define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10325476: ; ALL: # BB#0: -; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $85, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> ret <8 x i64> %shuffle @@ -1146,10 +899,7 @@ define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_11335577: ; ALL: # BB#0: -; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] -; ALL-NEXT: vextracti64x4 $1, 
%zmm0, %ymm0 -; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $255, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> ret <8 x i64> %shuffle @@ -1158,10 +908,7 @@ define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10235467: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,3] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,3] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $153, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> ret <8 x i64> %shuffle @@ -1170,10 +917,7 @@ define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10225466: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,2] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,2] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $17, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6> ret <8 x i64> %shuffle @@ -1182,10 +926,8 @@ define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00015444: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4> ret <8 x i64> %shuffle @@ -1194,10 +936,8 @@ define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00204644: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4> ret <8 x i64> %shuffle @@ -1206,10 +946,8 @@ define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_03004474: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,3,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4> ret <8 x i64> %shuffle @@ -1218,10 +956,8 @@ define <8 x i64> 
@shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10004444: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpbroadcastq %xmm0, %ymm0 -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> ret <8 x i64> %shuffle @@ -1230,10 +966,8 @@ define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_22006446: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,0,2] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6> ret <8 x i64> %shuffle @@ -1242,10 +976,8 @@ define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_33307474: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,3,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4> ret <8 x i64> %shuffle @@ -1254,9 +986,8 @@ define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_32104567: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> ret <8 x i64> %shuffle @@ -1265,10 +996,8 @@ define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00236744: ; ALL: # BB#0: -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4> ret <8 x i64> %shuffle @@ -1277,10 +1006,8 @@ define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00226644: ; ALL: # BB#0: -; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0] -; ALL-NEXT: 
vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4> ret <8 x i64> %shuffle @@ -1289,9 +1016,7 @@ define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10324567: ; ALL: # BB#0: -; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $165, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7> ret <8 x i64> %shuffle @@ -1300,9 +1025,7 @@ define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_11334567: ; ALL: # BB#0: -; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $175, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7> ret <8 x i64> %shuffle @@ -1311,9 +1034,7 @@ define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_01235467: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,3] -; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd $154, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7> ret <8 x i64> %shuffle @@ -1322,9 +1043,7 @@ define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_01235466: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,2] -; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd $26, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6> ret <8 x i64> %shuffle @@ -1333,10 +1052,8 @@ define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_002u6u44: ; ALL: # BB#0: -; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,0,0] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4> ret <8 x i64> %shuffle @@ -1345,10 +1062,8 @@ define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00uu66uu: ; ALL: # BB#0: -; ALL-NEXT: vpbroadcastq %xmm0, %ymm1 -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[2,2,2,3] -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1 +; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef> ret <8 x i64> %shuffle @@ -1357,9 +1072,7 @@ define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_103245uu: ; ALL: # BB#0: -; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $37, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef> ret <8 x i64> %shuffle @@ -1368,9 +1081,7 @@ define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_1133uu67: ; ALL: # BB#0: -; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7] -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; ALL-NEXT: vpermilpd $143, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7> ret <8 x i64> %shuffle @@ -1379,9 +1090,7 @@ define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_0uu354uu: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] -; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd $24, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef> ret <8 x i64> %shuffle @@ -1390,9 +1099,7 @@ define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_uuu3uu66: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] -; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; ALL-NEXT: vpermilpd $8, %zmm0, %zmm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6> ret <8 x i64> %shuffle @@ -1401,15 +1108,9 @@ define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_6caa87e5: ; ALL: # BB#0: -; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] -; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] -; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] -; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; ALL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2 +; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; ALL-NEXT: vmovaps %zmm2, %zmm0 ; ALL-NEXT: retq %shuffle = 
shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5> ret <8 x i64> %shuffle diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll new file mode 100644 index 000000000000..422fe052d38b --- /dev/null +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -0,0 +1,1788 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +target triple = "x86_64-unknown-unknown" + +define <2 x i64> @testv2i64(<2 x i64> %in) { +; SSE2-LABEL: testv2i64: +; SSE2: # BB#0: +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: bsfq %rax, %rax +; SSE2-NEXT: movl $64, %ecx +; SSE2-NEXT: cmoveq %rcx, %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: bsfq %rax, %rax +; SSE2-NEXT: cmoveq %rcx, %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv2i64: +; SSE3: # BB#0: +; SSE3-NEXT: movd %xmm0, %rax +; SSE3-NEXT: bsfq %rax, %rax +; SSE3-NEXT: movl $64, %ecx +; SSE3-NEXT: cmoveq %rcx, %rax +; SSE3-NEXT: movd %rax, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE3-NEXT: movd %xmm0, %rax +; SSE3-NEXT: bsfq %rax, %rax +; SSE3-NEXT: cmoveq %rcx, %rax +; SSE3-NEXT: movd %rax, %xmm0 +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv2i64: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: bsfq %rax, %rax +; SSSE3-NEXT: movl $64, %ecx +; SSSE3-NEXT: cmoveq %rcx, %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: bsfq %rax, %rax +; SSSE3-NEXT: cmoveq %rcx, %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv2i64: +; SSE41: # BB#0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: bsfq %rax, %rax +; SSE41-NEXT: movl $64, %ecx +; SSE41-NEXT: cmoveq %rcx, %rax +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: bsfq %rax, %rax +; SSE41-NEXT: cmoveq %rcx, %rax +; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: testv2i64: +; AVX: # BB#0: +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: bsfq %rax, %rax +; AVX-NEXT: movl $64, %ecx +; AVX-NEXT: cmoveq %rcx, %rax +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: bsfq %rax, %rax +; AVX-NEXT: cmoveq %rcx, %rax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq 
{{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 0) + ret <2 x i64> %out +} + +define <2 x i64> @testv2i64u(<2 x i64> %in) { +; SSE2-LABEL: testv2i64u: +; SSE2: # BB#0: +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: bsfq %rax, %rax +; SSE2-NEXT: movd %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %rax +; SSE2-NEXT: bsfq %rax, %rax +; SSE2-NEXT: movd %rax, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv2i64u: +; SSE3: # BB#0: +; SSE3-NEXT: movd %xmm0, %rax +; SSE3-NEXT: bsfq %rax, %rax +; SSE3-NEXT: movd %rax, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE3-NEXT: movd %xmm0, %rax +; SSE3-NEXT: bsfq %rax, %rax +; SSE3-NEXT: movd %rax, %xmm0 +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv2i64u: +; SSSE3: # BB#0: +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: bsfq %rax, %rax +; SSSE3-NEXT: movd %rax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %rax +; SSSE3-NEXT: bsfq %rax, %rax +; SSSE3-NEXT: movd %rax, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv2i64u: +; SSE41: # BB#0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: bsfq %rax, %rax +; SSE41-NEXT: movd %rax, %xmm1 +; SSE41-NEXT: movd %xmm0, %rax +; SSE41-NEXT: bsfq %rax, %rax +; SSE41-NEXT: movd %rax, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq +; +; AVX-LABEL: testv2i64u: +; AVX: # BB#0: +; AVX-NEXT: vpextrq $1, %xmm0, %rax +; AVX-NEXT: bsfq %rax, %rax +; AVX-NEXT: vmovq %rax, %xmm1 +; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: bsfq %rax, %rax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: retq + %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %in, i1 -1) + ret <2 x i64> %out +} + +define <4 x i32> @testv4i32(<4 x i32> %in) { +; SSE2-LABEL: testv4i32: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: bsfl %eax, %eax +; SSE2-NEXT: movl $32, %ecx +; SSE2-NEXT: cmovel %ecx, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: bsfl %eax, %eax +; SSE2-NEXT: cmovel %ecx, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: bsfl %eax, %eax +; SSE2-NEXT: cmovel %ecx, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: bsfl %eax, %eax +; SSE2-NEXT: cmovel %ecx, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv4i32: +; SSE3: # BB#0: +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSE3-NEXT: movd %xmm1, %eax +; SSE3-NEXT: bsfl %eax, %eax +; SSE3-NEXT: movl $32, %ecx +; SSE3-NEXT: cmovel %ecx, %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE3-NEXT: movd %xmm2, %eax +; SSE3-NEXT: bsfl %eax, %eax +; SSE3-NEXT: cmovel %ecx, %eax +; SSE3-NEXT: movd %eax, %xmm2 +; 
SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: bsfl %eax, %eax +; SSE3-NEXT: cmovel %ecx, %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: bsfl %eax, %eax +; SSE3-NEXT: cmovel %ecx, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv4i32: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: bsfl %eax, %eax +; SSSE3-NEXT: movl $32, %ecx +; SSSE3-NEXT: cmovel %ecx, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: bsfl %eax, %eax +; SSSE3-NEXT: cmovel %ecx, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: bsfl %eax, %eax +; SSSE3-NEXT: cmovel %ecx, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: bsfl %eax, %eax +; SSSE3-NEXT: cmovel %ecx, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv4i32: +; SSE41: # BB#0: +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: movl $32, %ecx +; SSE41-NEXT: cmovel %ecx, %eax +; SSE41-NEXT: movd %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: movd %edx, %xmm1 +; SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; SSE41-NEXT: pextrd $2, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: cmovel %ecx, %eax +; SSE41-NEXT: pinsrd $2, %eax, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: cmovel %ecx, %eax +; SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv4i32: +; AVX: # BB#0: +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: movl $32, %ecx +; AVX-NEXT: cmovel %ecx, %eax +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vmovd %edx, %xmm1 +; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: cmovel %ecx, %eax +; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $3, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: cmovel %ecx, %eax +; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 0) + ret <4 x i32> %out +} + +define <4 x i32> @testv4i32u(<4 x i32> %in) { +; SSE2-LABEL: testv4i32u: +; SSE2: # BB#0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: bsfl %eax, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE2-NEXT: movd %xmm2, %eax +; SSE2-NEXT: bsfl %eax, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: bsfl %eax, %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: bsfl %eax, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv4i32u: +; SSE3: # BB#0: +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSE3-NEXT: movd %xmm1, %eax +; SSE3-NEXT: bsfl %eax, %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSE3-NEXT: movd %xmm2, %eax +; SSE3-NEXT: bsfl %eax, %eax +; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: bsfl %eax, %eax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: bsfl %eax, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE3-NEXT: movdqa %xmm1, %xmm0 +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv4i32u: +; SSSE3: # BB#0: +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; SSSE3-NEXT: movd %xmm1, %eax +; SSSE3-NEXT: bsfl %eax, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] +; SSSE3-NEXT: movd %xmm2, %eax +; SSSE3-NEXT: bsfl %eax, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: bsfl %eax, %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: bsfl %eax, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv4i32u: +; SSE41: # BB#0: +; SSE41-NEXT: pextrd $1, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: movd %xmm0, %ecx +; SSE41-NEXT: bsfl %ecx, %ecx +; SSE41-NEXT: movd %ecx, %xmm1 +; SSE41-NEXT: pinsrd $1, %eax, %xmm1 +; SSE41-NEXT: pextrd $2, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrd $2, %eax, %xmm1 +; SSE41-NEXT: pextrd $3, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrd $3, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv4i32u: +; AVX: # BB#0: +; AVX-NEXT: vpextrd $1, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: bsfl %ecx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $2, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrd $3, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %in, i1 -1) + ret <4 x i32> %out +} + +define <8 x i16> @testv8i16(<8 x i16> %in) { +; SSE2-LABEL: testv8i16: +; SSE2: # BB#0: +; SSE2-NEXT: pextrw $7, %xmm0, %eax +; SSE2-NEXT: bsfw %ax, %cx +; SSE2-NEXT: movw $16, %ax +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: pextrw $3, %xmm0, %ecx +; SSE2-NEXT: bsfw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: pextrw $5, %xmm0, %ecx +; SSE2-NEXT: bsfw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: pextrw $1, %xmm0, %ecx +; SSE2-NEXT: bsfw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: pextrw $6, %xmm0, %ecx +; SSE2-NEXT: bsfw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: pextrw $2, %xmm0, %ecx +; SSE2-NEXT: bsfw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pextrw $4, %xmm0, %ecx +; SSE2-NEXT: bsfw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: bsfw %cx, %cx +; SSE2-NEXT: cmovew %ax, %cx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv8i16: +; SSE3: # BB#0: +; SSE3-NEXT: pextrw $7, %xmm0, %eax +; SSE3-NEXT: bsfw %ax, %cx +; SSE3-NEXT: movw $16, %ax +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: movd %ecx, %xmm1 +; SSE3-NEXT: pextrw $3, %xmm0, %ecx +; SSE3-NEXT: bsfw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: movd %ecx, %xmm2 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: pextrw $5, %xmm0, %ecx +; SSE3-NEXT: bsfw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: movd %ecx, %xmm3 +; SSE3-NEXT: pextrw $1, %xmm0, %ecx +; SSE3-NEXT: bsfw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: movd %ecx, %xmm1 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE3-NEXT: pextrw $6, %xmm0, %ecx +; SSE3-NEXT: bsfw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: movd %ecx, %xmm2 +; SSE3-NEXT: pextrw $2, %xmm0, %ecx +; SSE3-NEXT: bsfw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: movd %ecx, %xmm3 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE3-NEXT: pextrw $4, %xmm0, %ecx +; SSE3-NEXT: bsfw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: movd %ecx, %xmm2 +; SSE3-NEXT: movd %xmm0, %ecx +; SSE3-NEXT: bsfw %cx, %cx +; SSE3-NEXT: cmovew %ax, %cx +; SSE3-NEXT: movd %ecx, %xmm0 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv8i16: +; SSSE3: # BB#0: +; SSSE3-NEXT: pextrw $7, %xmm0, %eax +; SSSE3-NEXT: bsfw %ax, %cx +; SSSE3-NEXT: movw $16, %ax +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: pextrw $3, %xmm0, %ecx +; SSSE3-NEXT: bsfw %cx, %cx 
+; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: pextrw $5, %xmm0, %ecx +; SSSE3-NEXT: bsfw %cx, %cx +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: pextrw $1, %xmm0, %ecx +; SSSE3-NEXT: bsfw %cx, %cx +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: pextrw $6, %xmm0, %ecx +; SSSE3-NEXT: bsfw %cx, %cx +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: pextrw $2, %xmm0, %ecx +; SSSE3-NEXT: bsfw %cx, %cx +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: pextrw $4, %xmm0, %ecx +; SSSE3-NEXT: bsfw %cx, %cx +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: movd %xmm0, %ecx +; SSSE3-NEXT: bsfw %cx, %cx +; SSSE3-NEXT: cmovew %ax, %cx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv8i16: +; SSE41: # BB#0: +; SSE41-NEXT: pextrw $1, %xmm0, %eax +; SSE41-NEXT: bsfw %ax, %cx +; SSE41-NEXT: movw $16, %ax +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: movd %xmm0, %edx +; SSE41-NEXT: bsfw %dx, %dx +; SSE41-NEXT: cmovew %ax, %dx +; SSE41-NEXT: movd %edx, %xmm1 +; SSE41-NEXT: pinsrw $1, %ecx, %xmm1 +; SSE41-NEXT: pextrw $2, %xmm0, %ecx +; SSE41-NEXT: bsfw %cx, %cx +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: pinsrw $2, %ecx, %xmm1 +; SSE41-NEXT: pextrw $3, %xmm0, %ecx +; SSE41-NEXT: bsfw %cx, %cx +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: pinsrw $3, %ecx, %xmm1 +; SSE41-NEXT: pextrw $4, %xmm0, %ecx +; SSE41-NEXT: bsfw %cx, %cx +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: pinsrw $4, %ecx, %xmm1 +; SSE41-NEXT: pextrw $5, %xmm0, %ecx +; SSE41-NEXT: bsfw %cx, %cx +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: pinsrw $5, %ecx, %xmm1 +; SSE41-NEXT: pextrw $6, %xmm0, %ecx +; SSE41-NEXT: bsfw %cx, %cx +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: pinsrw $6, %ecx, %xmm1 +; SSE41-NEXT: pextrw $7, %xmm0, %ecx +; SSE41-NEXT: bsfw %cx, %cx +; SSE41-NEXT: cmovew %ax, %cx +; SSE41-NEXT: pinsrw $7, %ecx, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv8i16: +; AVX: # BB#0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: bsfw %ax, %cx +; AVX-NEXT: movw $16, %ax +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: vmovd %xmm0, %edx +; AVX-NEXT: bsfw %dx, %dx +; AVX-NEXT: cmovew %ax, %dx +; AVX-NEXT: vmovd %edx, %xmm1 +; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %ecx +; AVX-NEXT: bsfw %cx, %cx +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %ecx +; AVX-NEXT: bsfw %cx, %cx +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $4, %xmm0, %ecx +; AVX-NEXT: bsfw %cx, %cx +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1 +; AVX-NEXT: 
vpextrw $5, %xmm0, %ecx +; AVX-NEXT: bsfw %cx, %cx +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $6, %xmm0, %ecx +; AVX-NEXT: bsfw %cx, %cx +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $7, %xmm0, %ecx +; AVX-NEXT: bsfw %cx, %cx +; AVX-NEXT: cmovew %ax, %cx +; AVX-NEXT: vpinsrw $7, %ecx, %xmm1, %xmm0 +; AVX-NEXT: retq + %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 0) + ret <8 x i16> %out +} + +define <8 x i16> @testv8i16u(<8 x i16> %in) { +; SSE2-LABEL: testv8i16u: +; SSE2: # BB#0: +; SSE2-NEXT: pextrw $7, %xmm0, %eax +; SSE2-NEXT: bsfw %ax, %ax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pextrw $3, %xmm0, %eax +; SSE2-NEXT: bsfw %ax, %ax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: pextrw $5, %xmm0, %eax +; SSE2-NEXT: bsfw %ax, %ax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: bsfw %ax, %ax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pextrw $6, %xmm0, %eax +; SSE2-NEXT: bsfw %ax, %ax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: pextrw $2, %xmm0, %eax +; SSE2-NEXT: bsfw %ax, %ax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: pextrw $4, %xmm0, %eax +; SSE2-NEXT: bsfw %ax, %ax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: bsfw %ax, %ax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv8i16u: +; SSE3: # BB#0: +; SSE3-NEXT: pextrw $7, %xmm0, %eax +; SSE3-NEXT: bsfw %ax, %ax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: pextrw $3, %xmm0, %eax +; SSE3-NEXT: bsfw %ax, %ax +; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: pextrw $5, %xmm0, %eax +; SSE3-NEXT: bsfw %ax, %ax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: pextrw $1, %xmm0, %eax +; SSE3-NEXT: bsfw %ax, %ax +; SSE3-NEXT: movd %eax, %xmm3 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE3-NEXT: pextrw $6, %xmm0, %eax +; SSE3-NEXT: bsfw %ax, %ax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: pextrw $2, %xmm0, %eax +; SSE3-NEXT: bsfw %ax, %ax +; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE3-NEXT: pextrw $4, %xmm0, %eax +; SSE3-NEXT: bsfw %ax, %ax +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: bsfw %ax, %ax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE3-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv8i16u: +; SSSE3: # BB#0: +; SSSE3-NEXT: pextrw $7, %xmm0, %eax +; SSSE3-NEXT: bsfw %ax, %ax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pextrw $3, %xmm0, %eax +; SSSE3-NEXT: bsfw %ax, %ax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: pextrw $5, %xmm0, %eax +; SSSE3-NEXT: bsfw %ax, %ax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pextrw $1, %xmm0, %eax +; SSSE3-NEXT: bsfw %ax, %ax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: pextrw $6, %xmm0, %eax +; SSSE3-NEXT: bsfw %ax, %ax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: pextrw $2, %xmm0, %eax +; SSSE3-NEXT: bsfw %ax, %ax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: pextrw $4, %xmm0, %eax +; SSSE3-NEXT: bsfw %ax, %ax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: movd %xmm0, %eax +; SSSE3-NEXT: bsfw %ax, %ax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv8i16u: +; SSE41: # BB#0: +; SSE41-NEXT: pextrw $1, %xmm0, %eax +; SSE41-NEXT: bsfw %ax, %ax +; SSE41-NEXT: movd %xmm0, %ecx +; SSE41-NEXT: bsfw %cx, %cx +; SSE41-NEXT: movd %ecx, %xmm1 +; SSE41-NEXT: pinsrw $1, %eax, %xmm1 +; SSE41-NEXT: pextrw $2, %xmm0, %eax +; SSE41-NEXT: bsfw %ax, %ax +; SSE41-NEXT: pinsrw $2, %eax, %xmm1 +; SSE41-NEXT: pextrw $3, %xmm0, %eax +; SSE41-NEXT: bsfw %ax, %ax +; SSE41-NEXT: pinsrw $3, %eax, %xmm1 +; SSE41-NEXT: pextrw $4, %xmm0, %eax +; SSE41-NEXT: bsfw %ax, %ax +; SSE41-NEXT: pinsrw $4, %eax, %xmm1 +; SSE41-NEXT: pextrw $5, %xmm0, %eax +; SSE41-NEXT: bsfw %ax, %ax +; SSE41-NEXT: pinsrw $5, %eax, %xmm1 +; SSE41-NEXT: pextrw $6, %xmm0, %eax +; SSE41-NEXT: bsfw %ax, %ax +; SSE41-NEXT: pinsrw $6, %eax, %xmm1 +; SSE41-NEXT: pextrw $7, %xmm0, %eax +; SSE41-NEXT: bsfw %ax, %ax +; SSE41-NEXT: pinsrw $7, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv8i16u: +; AVX: # BB#0: +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: bsfw %ax, %ax +; AVX-NEXT: vmovd %xmm0, %ecx +; AVX-NEXT: bsfw %cx, %cx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: bsfw %ax, %ax +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: bsfw %ax, %ax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $4, %xmm0, %eax +; AVX-NEXT: bsfw %ax, %ax +; AVX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $5, %xmm0, %eax +; AVX-NEXT: bsfw %ax, %ax +; AVX-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $6, %xmm0, %eax +; AVX-NEXT: bsfw %ax, %ax +; AVX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $7, %xmm0, %eax +; AVX-NEXT: bsfw %ax, %ax +; AVX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %out = call 
<8 x i16> @llvm.cttz.v8i16(<8 x i16> %in, i1 -1) + ret <8 x i16> %out +} + +define <16 x i8> @testv16i8(<16 x i8> %in) { +; SSE2-LABEL: testv16i8: +; SSE2: # BB#0: +; SSE2: pushq %rbp +; SSE2: pushq %r14 +; SSE2: pushq %rbx +; SSE2: movaps %xmm0, -16(%rsp) +; SSE2-NEXT: movzbl -1(%rsp), %eax +; SSE2-NEXT: bsfl %eax, %edx +; SSE2-NEXT: movl $32, %eax +; SSE2-NEXT: cmovel %eax, %edx +; SSE2-NEXT: cmpl $32, %edx +; SSE2-NEXT: movl $8, %ecx +; SSE2-NEXT: cmovel %ecx, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movzbl -2(%rsp), %r14d +; SSE2-NEXT: movzbl -3(%rsp), %ebx +; SSE2-NEXT: movzbl -4(%rsp), %r9d +; SSE2-NEXT: movzbl -5(%rsp), %edi +; SSE2-NEXT: movzbl -6(%rsp), %r11d +; SSE2-NEXT: movzbl -7(%rsp), %edx +; SSE2-NEXT: movzbl -8(%rsp), %r8d +; SSE2-NEXT: movzbl -9(%rsp), %esi +; SSE2-NEXT: bsfl %esi, %esi +; SSE2-NEXT: cmovel %eax, %esi +; SSE2-NEXT: cmpl $32, %esi +; SSE2-NEXT: cmovel %ecx, %esi +; SSE2-NEXT: movd %esi, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: bsfl %edi, %esi +; SSE2-NEXT: cmovel %eax, %esi +; SSE2-NEXT: cmpl $32, %esi +; SSE2-NEXT: cmovel %ecx, %esi +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: movzbl -10(%rsp), %edi +; SSE2-NEXT: movzbl -11(%rsp), %esi +; SSE2-NEXT: movzbl -12(%rsp), %r10d +; SSE2-NEXT: movzbl -13(%rsp), %ebp +; SSE2-NEXT: bsfl %ebp, %ebp +; SSE2-NEXT: cmovel %eax, %ebp +; SSE2-NEXT: cmpl $32, %ebp +; SSE2-NEXT: cmovel %ecx, %ebp +; SSE2-NEXT: movd %ebp, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: bsfl %ebx, %ebx +; SSE2-NEXT: cmovel %eax, %ebx +; SSE2-NEXT: cmpl $32, %ebx +; SSE2-NEXT: cmovel %ecx, %ebx +; SSE2-NEXT: movd %ebx, %xmm1 +; SSE2-NEXT: bsfl %esi, %esi +; SSE2-NEXT: cmovel %eax, %esi +; SSE2-NEXT: cmpl $32, %esi +; SSE2-NEXT: cmovel %ecx, %esi +; SSE2-NEXT: movd %esi, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: bsfl %edx, %edx +; SSE2-NEXT: cmovel %eax, %edx +; SSE2-NEXT: cmpl $32, %edx +; SSE2-NEXT: cmovel %ecx, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: movzbl -14(%rsp), %edx +; SSE2-NEXT: movzbl -15(%rsp), %esi +; SSE2-NEXT: bsfl %esi, %esi +; SSE2-NEXT: cmovel %eax, %esi +; SSE2-NEXT: cmpl $32, %esi +; SSE2-NEXT: cmovel %ecx, %esi +; SSE2-NEXT: movd %esi, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: bsfl %r14d, %esi +; SSE2-NEXT: cmovel %eax, %esi +; SSE2-NEXT: cmpl $32, %esi +; SSE2-NEXT: cmovel %ecx, %esi +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: bsfl %edi, %esi +; SSE2-NEXT: cmovel %eax, %esi +; SSE2-NEXT: cmpl $32, %esi +; SSE2-NEXT: cmovel %ecx, %esi +; 
SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: bsfl %r11d, %esi +; SSE2-NEXT: cmovel %eax, %esi +; SSE2-NEXT: cmpl $32, %esi +; SSE2-NEXT: cmovel %ecx, %esi +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: bsfl %edx, %edx +; SSE2-NEXT: cmovel %eax, %edx +; SSE2-NEXT: cmpl $32, %edx +; SSE2-NEXT: cmovel %ecx, %edx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: bsfl %r9d, %edx +; SSE2-NEXT: cmovel %eax, %edx +; SSE2-NEXT: cmpl $32, %edx +; SSE2-NEXT: cmovel %ecx, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: bsfl %r10d, %edx +; SSE2-NEXT: cmovel %eax, %edx +; SSE2-NEXT: cmpl $32, %edx +; SSE2-NEXT: cmovel %ecx, %edx +; SSE2-NEXT: movd %edx, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: bsfl %r8d, %edx +; SSE2-NEXT: cmovel %eax, %edx +; SSE2-NEXT: cmpl $32, %edx +; SSE2-NEXT: cmovel %ecx, %edx +; SSE2-NEXT: movd %edx, %xmm4 +; SSE2-NEXT: movzbl -16(%rsp), %edx +; SSE2-NEXT: bsfl %edx, %edx +; SSE2-NEXT: cmovel %eax, %edx +; SSE2-NEXT: cmpl $32, %edx +; SSE2-NEXT: cmovel %ecx, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv16i8: +; SSE3: # BB#0: +; SSE3: pushq %rbp +; SSE3: pushq %r14 +; SSE3: pushq %rbx +; SSE3: movaps %xmm0, -16(%rsp) +; SSE3-NEXT: movzbl -1(%rsp), %eax +; SSE3-NEXT: bsfl %eax, %edx +; SSE3-NEXT: movl $32, %eax +; SSE3-NEXT: cmovel %eax, %edx +; SSE3-NEXT: cmpl $32, %edx +; SSE3-NEXT: movl $8, %ecx +; SSE3-NEXT: cmovel %ecx, %edx +; SSE3-NEXT: movd %edx, %xmm0 +; SSE3-NEXT: movzbl -2(%rsp), %r14d +; SSE3-NEXT: movzbl -3(%rsp), %ebx +; SSE3-NEXT: movzbl -4(%rsp), %r9d +; SSE3-NEXT: movzbl -5(%rsp), %edi +; SSE3-NEXT: movzbl -6(%rsp), %r11d +; SSE3-NEXT: movzbl -7(%rsp), %edx +; SSE3-NEXT: movzbl -8(%rsp), %r8d +; SSE3-NEXT: movzbl -9(%rsp), %esi +; SSE3-NEXT: bsfl %esi, %esi +; SSE3-NEXT: cmovel %eax, %esi +; SSE3-NEXT: cmpl $32, %esi +; SSE3-NEXT: cmovel %ecx, %esi +; SSE3-NEXT: movd %esi, %xmm1 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE3-NEXT: bsfl %edi, %esi +; SSE3-NEXT: cmovel %eax, %esi +; SSE3-NEXT: cmpl $32, %esi +; SSE3-NEXT: cmovel %ecx, %esi +; 
SSE3-NEXT: movd %esi, %xmm2 +; SSE3-NEXT: movzbl -10(%rsp), %edi +; SSE3-NEXT: movzbl -11(%rsp), %esi +; SSE3-NEXT: movzbl -12(%rsp), %r10d +; SSE3-NEXT: movzbl -13(%rsp), %ebp +; SSE3-NEXT: bsfl %ebp, %ebp +; SSE3-NEXT: cmovel %eax, %ebp +; SSE3-NEXT: cmpl $32, %ebp +; SSE3-NEXT: cmovel %ecx, %ebp +; SSE3-NEXT: movd %ebp, %xmm0 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE3-NEXT: bsfl %ebx, %ebx +; SSE3-NEXT: cmovel %eax, %ebx +; SSE3-NEXT: cmpl $32, %ebx +; SSE3-NEXT: cmovel %ecx, %ebx +; SSE3-NEXT: movd %ebx, %xmm1 +; SSE3-NEXT: bsfl %esi, %esi +; SSE3-NEXT: cmovel %eax, %esi +; SSE3-NEXT: cmpl $32, %esi +; SSE3-NEXT: cmovel %ecx, %esi +; SSE3-NEXT: movd %esi, %xmm2 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE3-NEXT: bsfl %edx, %edx +; SSE3-NEXT: cmovel %eax, %edx +; SSE3-NEXT: cmpl $32, %edx +; SSE3-NEXT: cmovel %ecx, %edx +; SSE3-NEXT: movd %edx, %xmm3 +; SSE3-NEXT: movzbl -14(%rsp), %edx +; SSE3-NEXT: movzbl -15(%rsp), %esi +; SSE3-NEXT: bsfl %esi, %esi +; SSE3-NEXT: cmovel %eax, %esi +; SSE3-NEXT: cmpl $32, %esi +; SSE3-NEXT: cmovel %ecx, %esi +; SSE3-NEXT: movd %esi, %xmm1 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE3-NEXT: bsfl %r14d, %esi +; SSE3-NEXT: cmovel %eax, %esi +; SSE3-NEXT: cmpl $32, %esi +; SSE3-NEXT: cmovel %ecx, %esi +; SSE3-NEXT: movd %esi, %xmm0 +; SSE3-NEXT: bsfl %edi, %esi +; SSE3-NEXT: cmovel %eax, %esi +; SSE3-NEXT: cmpl $32, %esi +; SSE3-NEXT: cmovel %ecx, %esi +; SSE3-NEXT: movd %esi, %xmm3 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE3-NEXT: bsfl %r11d, %esi +; SSE3-NEXT: cmovel %eax, %esi +; SSE3-NEXT: cmpl $32, %esi +; SSE3-NEXT: cmovel %ecx, %esi +; SSE3-NEXT: movd %esi, %xmm0 +; SSE3-NEXT: bsfl %edx, %edx +; SSE3-NEXT: cmovel %eax, %edx +; SSE3-NEXT: cmpl $32, %edx +; SSE3-NEXT: cmovel %ecx, %edx +; SSE3-NEXT: movd %edx, %xmm2 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE3-NEXT: bsfl %r9d, %edx +; SSE3-NEXT: cmovel %eax, %edx +; SSE3-NEXT: cmpl $32, %edx +; SSE3-NEXT: cmovel %ecx, %edx +; SSE3-NEXT: movd %edx, %xmm0 +; SSE3-NEXT: bsfl %r10d, %edx +; SSE3-NEXT: cmovel %eax, %edx +; SSE3-NEXT: cmpl $32, %edx +; SSE3-NEXT: cmovel %ecx, %edx +; SSE3-NEXT: movd %edx, %xmm3 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE3-NEXT: bsfl %r8d, %edx +; SSE3-NEXT: cmovel %eax, %edx +; SSE3-NEXT: cmpl $32, %edx +; SSE3-NEXT: cmovel %ecx, %edx +; SSE3-NEXT: movd %edx, %xmm4 +; SSE3-NEXT: movzbl -16(%rsp), %edx +; SSE3-NEXT: bsfl %edx, %edx +; SSE3-NEXT: cmovel %eax, %edx +; SSE3-NEXT: cmpl $32, %edx +; SSE3-NEXT: cmovel %ecx, %edx +; SSE3-NEXT: movd %edx, %xmm0 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE3-NEXT: popq %rbx +; SSE3-NEXT: popq %r14 +; SSE3-NEXT: popq %rbp +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv16i8: +; SSSE3: # BB#0: +; SSSE3: pushq %rbp +; SSSE3: pushq %r14 +; SSSE3: pushq %rbx +; SSSE3: movaps %xmm0, -16(%rsp) +; SSSE3-NEXT: movzbl -1(%rsp), %eax +; SSSE3-NEXT: bsfl %eax, %edx +; SSSE3-NEXT: movl $32, %eax +; SSSE3-NEXT: cmovel %eax, %edx +; SSSE3-NEXT: cmpl $32, %edx +; SSSE3-NEXT: movl $8, %ecx +; SSSE3-NEXT: cmovel %ecx, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movzbl -2(%rsp), %r14d +; SSSE3-NEXT: movzbl -3(%rsp), %ebx +; SSSE3-NEXT: movzbl -4(%rsp), %r9d +; SSSE3-NEXT: movzbl -5(%rsp), %edi +; SSSE3-NEXT: movzbl -6(%rsp), %r11d +; SSSE3-NEXT: movzbl -7(%rsp), %edx +; SSSE3-NEXT: movzbl -8(%rsp), %r8d +; SSSE3-NEXT: movzbl -9(%rsp), %esi +; SSSE3-NEXT: bsfl %esi, %esi +; SSSE3-NEXT: cmovel %eax, %esi +; SSSE3-NEXT: cmpl $32, %esi +; SSSE3-NEXT: cmovel %ecx, %esi +; SSSE3-NEXT: movd %esi, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: bsfl %edi, %esi +; SSSE3-NEXT: cmovel %eax, %esi +; SSSE3-NEXT: cmpl $32, %esi +; SSSE3-NEXT: cmovel %ecx, %esi +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: movzbl -10(%rsp), %edi +; SSSE3-NEXT: movzbl -11(%rsp), %esi +; SSSE3-NEXT: movzbl -12(%rsp), %r10d +; SSSE3-NEXT: movzbl -13(%rsp), %ebp +; SSSE3-NEXT: bsfl %ebp, %ebp +; SSSE3-NEXT: cmovel %eax, %ebp +; SSSE3-NEXT: cmpl $32, %ebp +; SSSE3-NEXT: cmovel %ecx, %ebp +; SSSE3-NEXT: movd %ebp, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: bsfl %ebx, %ebx +; SSSE3-NEXT: cmovel %eax, %ebx +; SSSE3-NEXT: cmpl $32, %ebx +; SSSE3-NEXT: cmovel %ecx, %ebx +; SSSE3-NEXT: movd %ebx, %xmm1 +; SSSE3-NEXT: bsfl %esi, %esi +; SSSE3-NEXT: cmovel %eax, %esi +; SSSE3-NEXT: cmpl $32, %esi +; SSSE3-NEXT: cmovel %ecx, %esi +; SSSE3-NEXT: movd %esi, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: bsfl %edx, %edx +; SSSE3-NEXT: cmovel %eax, %edx +; SSSE3-NEXT: cmpl $32, %edx +; SSSE3-NEXT: cmovel %ecx, %edx +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: movzbl -14(%rsp), %edx +; SSSE3-NEXT: movzbl -15(%rsp), %esi +; SSSE3-NEXT: bsfl %esi, %esi +; SSSE3-NEXT: cmovel %eax, %esi +; SSSE3-NEXT: cmpl $32, %esi +; SSSE3-NEXT: cmovel %ecx, %esi +; SSSE3-NEXT: movd %esi, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: bsfl %r14d, %esi +; SSSE3-NEXT: cmovel %eax, %esi +; SSSE3-NEXT: cmpl $32, %esi +; SSSE3-NEXT: cmovel %ecx, %esi +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: bsfl %edi, %esi +; SSSE3-NEXT: cmovel %eax, %esi +; SSSE3-NEXT: cmpl $32, %esi +; SSSE3-NEXT: cmovel %ecx, %esi +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: bsfl %r11d, %esi +; SSSE3-NEXT: cmovel %eax, %esi +; SSSE3-NEXT: cmpl $32, %esi +; SSSE3-NEXT: cmovel %ecx, %esi +; SSSE3-NEXT: movd %esi, %xmm0 +; SSSE3-NEXT: bsfl %edx, %edx +; SSSE3-NEXT: cmovel %eax, %edx +; SSSE3-NEXT: cmpl $32, %edx +; SSSE3-NEXT: cmovel %ecx, %edx +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSSE3-NEXT: bsfl %r9d, %edx +; SSSE3-NEXT: cmovel %eax, %edx +; SSSE3-NEXT: cmpl $32, %edx +; SSSE3-NEXT: cmovel %ecx, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: bsfl %r10d, %edx +; SSSE3-NEXT: cmovel %eax, %edx +; SSSE3-NEXT: cmpl $32, %edx +; SSSE3-NEXT: cmovel %ecx, %edx +; SSSE3-NEXT: movd %edx, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: bsfl %r8d, %edx +; SSSE3-NEXT: cmovel %eax, %edx +; SSSE3-NEXT: cmpl $32, %edx +; SSSE3-NEXT: cmovel %ecx, %edx +; SSSE3-NEXT: movd %edx, %xmm4 +; SSSE3-NEXT: movzbl -16(%rsp), %edx +; SSSE3-NEXT: bsfl %edx, %edx +; SSSE3-NEXT: cmovel %eax, %edx +; SSSE3-NEXT: cmpl $32, %edx +; SSSE3-NEXT: cmovel %ecx, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpcklbw 
{{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: popq %r14 +; SSSE3-NEXT: popq %rbp +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv16i8: +; SSE41: # BB#0: +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %edx +; SSE41-NEXT: movl $32, %eax +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: movl $8, %ecx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pextrb $0, %xmm0, %esi +; SSE41-NEXT: bsfl %esi, %esi +; SSE41-NEXT: cmovel %eax, %esi +; SSE41-NEXT: cmpl $32, %esi +; SSE41-NEXT: cmovel %ecx, %esi +; SSE41-NEXT: movd %esi, %xmm1 +; SSE41-NEXT: pinsrb $1, %edx, %xmm1 +; SSE41-NEXT: pextrb $2, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pinsrb $2, %edx, %xmm1 +; SSE41-NEXT: pextrb $3, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pinsrb $3, %edx, %xmm1 +; SSE41-NEXT: pextrb $4, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pinsrb $4, %edx, %xmm1 +; SSE41-NEXT: pextrb $5, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pinsrb $5, %edx, %xmm1 +; SSE41-NEXT: pextrb $6, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pinsrb $6, %edx, %xmm1 +; SSE41-NEXT: pextrb $7, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pinsrb $7, %edx, %xmm1 +; SSE41-NEXT: pextrb $8, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pinsrb $8, %edx, %xmm1 +; SSE41-NEXT: pextrb $9, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pinsrb $9, %edx, %xmm1 +; SSE41-NEXT: pextrb $10, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pinsrb $10, %edx, %xmm1 +; SSE41-NEXT: pextrb $11, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pinsrb $11, %edx, %xmm1 +; SSE41-NEXT: pextrb $12, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pinsrb $12, %edx, %xmm1 +; SSE41-NEXT: pextrb $13, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pinsrb $13, %edx, %xmm1 +; SSE41-NEXT: pextrb $14, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel %ecx, %edx +; SSE41-NEXT: pinsrb $14, %edx, %xmm1 +; SSE41-NEXT: pextrb $15, %xmm0, %edx +; SSE41-NEXT: bsfl %edx, %edx +; SSE41-NEXT: cmovel %eax, %edx +; SSE41-NEXT: cmpl $32, %edx +; SSE41-NEXT: cmovel 
%ecx, %edx +; SSE41-NEXT: pinsrb $15, %edx, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv16i8: +; AVX: # BB#0: +; AVX-NEXT: vpextrb $1, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %edx +; AVX-NEXT: movl $32, %eax +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: movl $8, %ecx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpextrb $0, %xmm0, %esi +; AVX-NEXT: bsfl %esi, %esi +; AVX-NEXT: cmovel %eax, %esi +; AVX-NEXT: cmpl $32, %esi +; AVX-NEXT: cmovel %ecx, %esi +; AVX-NEXT: vmovd %esi, %xmm1 +; AVX-NEXT: vpinsrb $1, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $2, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $3, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $3, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $4, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $5, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $5, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $6, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $7, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $7, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $8, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $8, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $9, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $9, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $10, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $10, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $11, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $11, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $12, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $12, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $13, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $13, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $14, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $15, %xmm0, %edx +; AVX-NEXT: bsfl %edx, %edx +; AVX-NEXT: cmovel %eax, %edx +; AVX-NEXT: cmpl $32, %edx +; AVX-NEXT: cmovel %ecx, %edx +; AVX-NEXT: vpinsrb $15, %edx, %xmm1, %xmm0 +; AVX-NEXT: retq + %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 0) + ret <16 x i8> %out +} + +define <16 x i8> @testv16i8u(<16 x i8> %in) 
{ +; SSE2-LABEL: testv16i8u: +; SSE2: # BB#0: +; SSE2: pushq %rbx +; SSE2: movaps %xmm0, -16(%rsp) +; SSE2-NEXT: movzbl -1(%rsp), %eax +; SSE2-NEXT: bsfl %eax, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl -2(%rsp), %r11d +; SSE2-NEXT: movzbl -3(%rsp), %eax +; SSE2-NEXT: movzbl -4(%rsp), %r9d +; SSE2-NEXT: movzbl -5(%rsp), %edi +; SSE2-NEXT: movzbl -6(%rsp), %r10d +; SSE2-NEXT: movzbl -7(%rsp), %ecx +; SSE2-NEXT: movzbl -8(%rsp), %r8d +; SSE2-NEXT: movzbl -9(%rsp), %edx +; SSE2-NEXT: bsfl %edx, %edx +; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: bsfl %edi, %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: movzbl -10(%rsp), %edx +; SSE2-NEXT: movzbl -11(%rsp), %esi +; SSE2-NEXT: movzbl -12(%rsp), %edi +; SSE2-NEXT: movzbl -13(%rsp), %ebx +; SSE2-NEXT: bsfl %ebx, %ebx +; SSE2-NEXT: movd %ebx, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: bsfl %eax, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: bsfl %esi, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: bsfl %ecx, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzbl -14(%rsp), %eax +; SSE2-NEXT: movzbl -15(%rsp), %ecx +; SSE2-NEXT: bsfl %ecx, %ecx +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: bsfl %r11d, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: bsfl %edx, %ecx +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: bsfl %r10d, %ecx +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: bsfl %eax, %eax +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: bsfl %r9d, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: bsfl %edi, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: bsfl %r8d, %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movzbl -16(%rsp), %eax +; SSE2-NEXT: bsfl %eax, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 
= xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: retq +; +; SSE3-LABEL: testv16i8u: +; SSE3: # BB#0: +; SSE3: pushq %rbx +; SSE3: movaps %xmm0, -16(%rsp) +; SSE3-NEXT: movzbl -1(%rsp), %eax +; SSE3-NEXT: bsfl %eax, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: movzbl -2(%rsp), %r11d +; SSE3-NEXT: movzbl -3(%rsp), %eax +; SSE3-NEXT: movzbl -4(%rsp), %r9d +; SSE3-NEXT: movzbl -5(%rsp), %edi +; SSE3-NEXT: movzbl -6(%rsp), %r10d +; SSE3-NEXT: movzbl -7(%rsp), %ecx +; SSE3-NEXT: movzbl -8(%rsp), %r8d +; SSE3-NEXT: movzbl -9(%rsp), %edx +; SSE3-NEXT: bsfl %edx, %edx +; SSE3-NEXT: movd %edx, %xmm1 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE3-NEXT: bsfl %edi, %edx +; SSE3-NEXT: movd %edx, %xmm0 +; SSE3-NEXT: movzbl -10(%rsp), %edx +; SSE3-NEXT: movzbl -11(%rsp), %esi +; SSE3-NEXT: movzbl -12(%rsp), %edi +; SSE3-NEXT: movzbl -13(%rsp), %ebx +; SSE3-NEXT: bsfl %ebx, %ebx +; SSE3-NEXT: movd %ebx, %xmm2 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE3-NEXT: bsfl %eax, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: bsfl %esi, %eax +; SSE3-NEXT: movd %eax, %xmm3 +; SSE3-NEXT: punpcklbw %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE3-NEXT: bsfl %ecx, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: movzbl -14(%rsp), %eax +; SSE3-NEXT: movzbl -15(%rsp), %ecx +; SSE3-NEXT: bsfl %ecx, %ecx +; SSE3-NEXT: movd %ecx, %xmm1 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE3-NEXT: bsfl %r11d, %ecx +; SSE3-NEXT: movd %ecx, %xmm0 +; SSE3-NEXT: bsfl %edx, %ecx +; SSE3-NEXT: movd %ecx, %xmm2 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE3-NEXT: bsfl %r10d, %ecx +; SSE3-NEXT: movd %ecx, %xmm0 +; SSE3-NEXT: bsfl %eax, %eax +; SSE3-NEXT: movd %eax, %xmm3 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE3-NEXT: bsfl %r9d, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: bsfl %edi, %eax +; SSE3-NEXT: movd %eax, %xmm2 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE3-NEXT: bsfl %r8d, %eax +; SSE3-NEXT: movd %eax, %xmm4 +; SSE3-NEXT: movzbl -16(%rsp), %eax +; SSE3-NEXT: bsfl %eax, %eax +; SSE3-NEXT: movd %eax, %xmm0 +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE3-NEXT: popq %rbx +; SSE3-NEXT: retq +; +; SSSE3-LABEL: testv16i8u: +; SSSE3: # BB#0: +; SSSE3: pushq %rbx +; SSSE3: movaps %xmm0, -16(%rsp) +; SSSE3-NEXT: movzbl -1(%rsp), %eax +; SSSE3-NEXT: bsfl %eax, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl -2(%rsp), %r11d +; SSSE3-NEXT: movzbl -3(%rsp), %eax +; SSSE3-NEXT: movzbl -4(%rsp), %r9d +; SSSE3-NEXT: movzbl -5(%rsp), %edi +; SSSE3-NEXT: movzbl -6(%rsp), %r10d +; SSSE3-NEXT: movzbl -7(%rsp), %ecx +; SSSE3-NEXT: movzbl -8(%rsp), %r8d +; SSSE3-NEXT: movzbl -9(%rsp), %edx +; SSSE3-NEXT: bsfl %edx, %edx +; SSSE3-NEXT: movd %edx, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: bsfl %edi, %edx +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: movzbl -10(%rsp), %edx +; SSSE3-NEXT: movzbl -11(%rsp), %esi +; SSSE3-NEXT: movzbl -12(%rsp), %edi +; SSSE3-NEXT: movzbl -13(%rsp), %ebx +; SSSE3-NEXT: bsfl %ebx, %ebx +; SSSE3-NEXT: movd %ebx, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: bsfl %eax, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: bsfl %esi, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw %xmm0, %xmm3 # xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: bsfl %ecx, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzbl -14(%rsp), %eax +; SSSE3-NEXT: movzbl -15(%rsp), %ecx +; SSSE3-NEXT: bsfl %ecx, %ecx +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: bsfl %r11d, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: bsfl %edx, %ecx +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: bsfl %r10d, %ecx +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: bsfl %eax, %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: bsfl %r9d, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: bsfl %edi, %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSSE3-NEXT: bsfl %r8d, %eax +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movzbl -16(%rsp), %eax +; SSSE3-NEXT: bsfl %eax, %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: popq %rbx +; SSSE3-NEXT: retq +; +; SSE41-LABEL: testv16i8u: +; SSE41: # BB#0: +; SSE41-NEXT: pextrb $1, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pextrb $0, %xmm0, %ecx +; SSE41-NEXT: bsfl %ecx, %ecx +; SSE41-NEXT: movd %ecx, %xmm1 +; SSE41-NEXT: pinsrb $1, %eax, %xmm1 +; SSE41-NEXT: pextrb $2, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb $2, %eax, %xmm1 +; SSE41-NEXT: pextrb $3, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb $3, %eax, %xmm1 +; SSE41-NEXT: pextrb $4, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb $4, %eax, %xmm1 +; SSE41-NEXT: pextrb $5, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb $5, %eax, %xmm1 +; SSE41-NEXT: pextrb $6, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb $6, %eax, %xmm1 +; SSE41-NEXT: pextrb $7, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb $7, %eax, %xmm1 +; SSE41-NEXT: pextrb $8, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb $8, %eax, %xmm1 +; SSE41-NEXT: pextrb $9, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb $9, %eax, %xmm1 +; SSE41-NEXT: pextrb $10, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb $10, %eax, %xmm1 +; SSE41-NEXT: pextrb $11, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb 
$11, %eax, %xmm1 +; SSE41-NEXT: pextrb $12, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb $12, %eax, %xmm1 +; SSE41-NEXT: pextrb $13, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb $13, %eax, %xmm1 +; SSE41-NEXT: pextrb $14, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb $14, %eax, %xmm1 +; SSE41-NEXT: pextrb $15, %xmm0, %eax +; SSE41-NEXT: bsfl %eax, %eax +; SSE41-NEXT: pinsrb $15, %eax, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: testv16i8u: +; AVX: # BB#0: +; AVX-NEXT: vpextrb $1, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpextrb $0, %xmm0, %ecx +; AVX-NEXT: bsfl %ecx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $2, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $3, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $4, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $5, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $6, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $7, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $8, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $9, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $10, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $11, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $12, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $13, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $14, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrb $15, %xmm0, %eax +; AVX-NEXT: bsfl %eax, %eax +; AVX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq + %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %in, i1 -1) + ret <16 x i8> %out +} + +define <2 x i64> @foldv2i64() { +; SSE-LABEL: foldv2i64: +; SSE: # BB#0: +; SSE-NEXT: movl $8, %eax +; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: foldv2i64: +; AVX: # BB#0: +; AVX-NEXT: movl $8, %eax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: retq + %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0) + ret <2 x i64> %out +} + +define <2 x i64> @foldv2i64u() { +; SSE-LABEL: foldv2i64u: +; SSE: # BB#0: +; SSE-NEXT: movl $8, %eax +; SSE-NEXT: movd %rax, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: foldv2i64u: +; AVX: # BB#0: +; AVX-NEXT: movl $8, %eax +; AVX-NEXT: vmovq %rax, %xmm0 +; AVX-NEXT: retq + %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1) + ret <2 x i64> %out +} + +define <4 x i32> @foldv4i32() { +; SSE-LABEL: foldv4i32: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv4i32: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX-NEXT: retq + %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0) + ret <4 x i32> %out +} + +define <4 x i32> 
@foldv4i32u() { +; SSE-LABEL: foldv4i32u: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv4i32u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX-NEXT: retq + %out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1) + ret <4 x i32> %out +} + +define <8 x i16> @foldv8i16() { +; SSE-LABEL: foldv8i16: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv8i16: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX-NEXT: retq + %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0) + ret <8 x i16> %out +} + +define <8 x i16> @foldv8i16u() { +; SSE-LABEL: foldv8i16u: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv8i16u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX-NEXT: retq + %out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1) + ret <8 x i16> %out +} + +define <16 x i8> @foldv16i8() { +; SSE-LABEL: foldv16i8: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv16i8: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX-NEXT: retq + %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0) + ret <16 x i8> %out +} + +define <16 x i8> @foldv16i8u() { +; SSE-LABEL: foldv16i8u: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; SSE-NEXT: retq +; +; AVX-LABEL: foldv16i8u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX-NEXT: retq + %out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1) + ret <16 x i8> %out +} + +declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) +declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) +declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1) +declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1) diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll new file mode 100644 index 000000000000..8f744f79f85f --- /dev/null +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -0,0 +1,1195 @@ +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 + +target triple = "x86_64-unknown-unknown" + +define <4 x i64> @testv4i64(<4 x i64> %in) { +; AVX1-LABEL: testv4i64: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: bsfq %rax, %rax +; AVX1-NEXT: movl $64, %ecx +; AVX1-NEXT: cmoveq %rcx, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: bsfq %rax, %rax +; AVX1-NEXT: cmoveq %rcx, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: bsfq %rax, %rax +; AVX1-NEXT: cmoveq %rcx, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: bsfq %rax, 
%rax +; AVX1-NEXT: cmoveq %rcx, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv4i64: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: bsfq %rax, %rax +; AVX2-NEXT: movl $64, %ecx +; AVX2-NEXT: cmoveq %rcx, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: bsfq %rax, %rax +; AVX2-NEXT: cmoveq %rcx, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: bsfq %rax, %rax +; AVX2-NEXT: cmoveq %rcx, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: bsfq %rax, %rax +; AVX2-NEXT: cmoveq %rcx, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 0) + ret <4 x i64> %out +} + +define <4 x i64> @testv4i64u(<4 x i64> %in) { +; AVX1-LABEL: testv4i64u: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: bsfq %rax, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: bsfq %rax, %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vpextrq $1, %xmm0, %rax +; AVX1-NEXT: bsfq %rax, %rax +; AVX1-NEXT: vmovq %rax, %xmm2 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: bsfq %rax, %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv4i64u: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: bsfq %rax, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: bsfq %rax, %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: bsfq %rax, %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: bsfq %rax, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %in, i1 -1) + ret <4 x i64> %out +} + +define <8 x i32> @testv8i32(<8 x i32> %in) { +; AVX1-LABEL: testv8i32: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %ecx +; AVX1-NEXT: movl $32, %eax +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: vmovd %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm1, %ecx +; AVX1-NEXT: bsfl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm1, %ecx +; AVX1-NEXT: bsfl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; AVX1-NEXT: bsfl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: vmovd %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %ecx, %xmm2, 
%xmm2 +; AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; AVX1-NEXT: bsfl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; AVX1-NEXT: bsfl %ecx, %ecx +; AVX1-NEXT: cmovel %eax, %ecx +; AVX1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i32: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrd $1, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %ecx +; AVX2-NEXT: movl $32, %eax +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: vmovd %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: vmovd %edx, %xmm2 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; AVX2-NEXT: bsfl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $3, %xmm1, %ecx +; AVX2-NEXT: bsfl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm1 +; AVX2-NEXT: vpextrd $1, %xmm0, %ecx +; AVX2-NEXT: bsfl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: vmovd %edx, %xmm2 +; AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $2, %xmm0, %ecx +; AVX2-NEXT: bsfl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; AVX2-NEXT: bsfl %ecx, %ecx +; AVX2-NEXT: cmovel %eax, %ecx +; AVX2-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 0) + ret <8 x i32> %out +} + +define <8 x i32> @testv8i32u(<8 x i32> %in) { +; AVX1-LABEL: testv8i32u: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vmovd %xmm1, %ecx +; AVX1-NEXT: bsfl %ecx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 +; AVX1-NEXT: vpextrd $1, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vmovd %xmm0, %ecx +; AVX1-NEXT: bsfl %ecx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrd $3, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i32u: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrd $1, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vmovd %xmm1, %ecx +; AVX2-NEXT: bsfl %ecx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $2, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $3, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 +; AVX2-NEXT: vpextrd $1, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: bsfl %ecx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrd $1, %eax, 
%xmm2, %xmm2 +; AVX2-NEXT: vpextrd $2, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrd $3, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %in, i1 -1) + ret <8 x i32> %out +} + +define <16 x i16> @testv16i16(<16 x i16> %in) { +; AVX1-LABEL: testv16i16: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrw $1, %xmm1, %eax +; AVX1-NEXT: bsfw %ax, %cx +; AVX1-NEXT: movw $16, %ax +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vmovd %xmm1, %edx +; AVX1-NEXT: bsfw %dx, %dx +; AVX1-NEXT: cmovew %ax, %dx +; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $2, %xmm1, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $3, %xmm1, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $4, %xmm1, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $5, %xmm1, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $6, %xmm1, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $7, %xmm1, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1 +; AVX1-NEXT: vpextrw $1, %xmm0, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vmovd %xmm0, %edx +; AVX1-NEXT: bsfw %dx, %dx +; AVX1-NEXT: cmovew %ax, %dx +; AVX1-NEXT: vmovd %edx, %xmm2 +; AVX1-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $2, %xmm0, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $3, %xmm0, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $4, %xmm0, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $5, %xmm0, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $6, %xmm0, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $7, %xmm0, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: cmovew %ax, %cx +; AVX1-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i16: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrw $1, %xmm1, %eax +; AVX2-NEXT: bsfw %ax, %cx +; AVX2-NEXT: movw $16, %ax +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vmovd %xmm1, %edx +; AVX2-NEXT: bsfw %dx, %dx +; AVX2-NEXT: cmovew %ax, %dx +; AVX2-NEXT: vmovd %edx, %xmm2 +; AVX2-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $2, %xmm1, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $3, %xmm1, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $4, %xmm1, %ecx +; 
AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $5, %xmm1, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $6, %xmm1, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $7, %xmm1, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm1 +; AVX2-NEXT: vpextrw $1, %xmm0, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vmovd %xmm0, %edx +; AVX2-NEXT: bsfw %dx, %dx +; AVX2-NEXT: cmovew %ax, %dx +; AVX2-NEXT: vmovd %edx, %xmm2 +; AVX2-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $2, %xmm0, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vpinsrw $2, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $3, %xmm0, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $4, %xmm0, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vpinsrw $4, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $5, %xmm0, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $6, %xmm0, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $7, %xmm0, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: cmovew %ax, %cx +; AVX2-NEXT: vpinsrw $7, %ecx, %xmm2, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 0) + ret <16 x i16> %out +} + +define <16 x i16> @testv16i16u(<16 x i16> %in) { +; AVX1-LABEL: testv16i16u: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrw $1, %xmm1, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vmovd %xmm1, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $2, %xmm1, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $3, %xmm1, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $4, %xmm1, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $5, %xmm1, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $6, %xmm1, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $7, %xmm1, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 +; AVX1-NEXT: vpextrw $1, %xmm0, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vmovd %xmm0, %ecx +; AVX1-NEXT: bsfw %cx, %cx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $2, %xmm0, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $3, %xmm0, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $4, %xmm0, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $5, %xmm0, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrw $6, %xmm0, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vpinsrw $6, %eax, %xmm2, 
%xmm2 +; AVX1-NEXT: vpextrw $7, %xmm0, %eax +; AVX1-NEXT: bsfw %ax, %ax +; AVX1-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i16u: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrw $1, %xmm1, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vmovd %xmm1, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $2, %xmm1, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $3, %xmm1, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $4, %xmm1, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $5, %xmm1, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $6, %xmm1, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $7, %xmm1, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vpinsrw $7, %eax, %xmm2, %xmm1 +; AVX2-NEXT: vpextrw $1, %xmm0, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vmovd %xmm0, %ecx +; AVX2-NEXT: bsfw %cx, %cx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $2, %xmm0, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $3, %xmm0, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $4, %xmm0, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $5, %xmm0, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vpinsrw $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $6, %xmm0, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vpinsrw $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrw $7, %xmm0, %eax +; AVX2-NEXT: bsfw %ax, %ax +; AVX2-NEXT: vpinsrw $7, %eax, %xmm2, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %in, i1 -1) + ret <16 x i16> %out +} + +define <32 x i8> @testv32i8(<32 x i8> %in) { +; AVX1-LABEL: testv32i8: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrb $1, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %edx +; AVX1-NEXT: movl $32, %eax +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: movl $8, %ecx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpextrb $0, %xmm1, %esi +; AVX1-NEXT: bsfl %esi, %esi +; AVX1-NEXT: cmovel %eax, %esi +; AVX1-NEXT: cmpl $32, %esi +; AVX1-NEXT: cmovel %ecx, %esi +; AVX1-NEXT: vmovd %esi, %xmm2 +; AVX1-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $2, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $3, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $4, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $5, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; 
AVX1-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $6, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $7, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $8, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $8, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $9, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $9, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $10, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $10, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $11, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $11, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $12, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $13, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $13, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $14, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $15, %xmm1, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $15, %edx, %xmm2, %xmm1 +; AVX1-NEXT: vpextrb $1, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpextrb $0, %xmm0, %esi +; AVX1-NEXT: bsfl %esi, %esi +; AVX1-NEXT: cmovel %eax, %esi +; AVX1-NEXT: cmpl $32, %esi +; AVX1-NEXT: cmovel %ecx, %esi +; AVX1-NEXT: vmovd %esi, %xmm2 +; AVX1-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $2, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $3, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $4, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $5, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $6, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; 
AVX1-NEXT: vpextrb $7, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $8, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $8, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $9, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $9, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $10, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $10, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $11, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $11, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $12, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $13, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $13, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $14, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $15, %xmm0, %edx +; AVX1-NEXT: bsfl %edx, %edx +; AVX1-NEXT: cmovel %eax, %edx +; AVX1-NEXT: cmpl $32, %edx +; AVX1-NEXT: cmovel %ecx, %edx +; AVX1-NEXT: vpinsrb $15, %edx, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv32i8: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrb $1, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %edx +; AVX2-NEXT: movl $32, %eax +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: movl $8, %ecx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpextrb $0, %xmm1, %esi +; AVX2-NEXT: bsfl %esi, %esi +; AVX2-NEXT: cmovel %eax, %esi +; AVX2-NEXT: cmpl $32, %esi +; AVX2-NEXT: cmovel %ecx, %esi +; AVX2-NEXT: vmovd %esi, %xmm2 +; AVX2-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $2, %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $3, %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $4, %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $5, %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $6, %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $7, %xmm1, %edx +; 
AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $8, %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $8, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $9, %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $9, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $10, %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $10, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $11, %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $11, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $12, %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $13, %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $13, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $14, %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $15, %xmm1, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $15, %edx, %xmm2, %xmm1 +; AVX2-NEXT: vpextrb $1, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpextrb $0, %xmm0, %esi +; AVX2-NEXT: bsfl %esi, %esi +; AVX2-NEXT: cmovel %eax, %esi +; AVX2-NEXT: cmpl $32, %esi +; AVX2-NEXT: cmovel %ecx, %esi +; AVX2-NEXT: vmovd %esi, %xmm2 +; AVX2-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $2, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $3, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $3, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $4, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $5, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $6, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $6, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $7, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $7, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $8, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, 
%edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $8, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $9, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $9, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $10, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $10, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $11, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $11, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $12, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $13, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $13, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $14, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $15, %xmm0, %edx +; AVX2-NEXT: bsfl %edx, %edx +; AVX2-NEXT: cmovel %eax, %edx +; AVX2-NEXT: cmpl $32, %edx +; AVX2-NEXT: cmovel %ecx, %edx +; AVX2-NEXT: vpinsrb $15, %edx, %xmm2, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 0) + ret <32 x i8> %out +} + +define <32 x i8> @testv32i8u(<32 x i8> %in) { +; AVX1-LABEL: testv32i8u: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpextrb $1, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpextrb $0, %xmm1, %ecx +; AVX1-NEXT: bsfl %ecx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $2, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $3, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $4, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $5, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $6, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $7, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $8, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $9, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $10, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $11, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $12, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $13, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $14, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; 
AVX1-NEXT: vpextrb $15, %xmm1, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 +; AVX1-NEXT: vpextrb $1, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpextrb $0, %xmm0, %ecx +; AVX1-NEXT: bsfl %ecx, %ecx +; AVX1-NEXT: vmovd %ecx, %xmm2 +; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $3, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $5, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $7, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $9, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $11, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $13, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX1-NEXT: vpextrb $15, %xmm0, %eax +; AVX1-NEXT: bsfl %eax, %eax +; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv32i8u: +; AVX2: # BB#0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrb $1, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpextrb $0, %xmm1, %ecx +; AVX2-NEXT: bsfl %ecx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $2, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $3, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $4, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $5, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $6, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $7, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $8, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $9, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $10, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $11, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $12, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX2-NEXT: 
vpextrb $13, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $14, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $15, %xmm1, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1 +; AVX2-NEXT: vpextrb $1, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpextrb $0, %xmm0, %ecx +; AVX2-NEXT: bsfl %ecx, %ecx +; AVX2-NEXT: vmovd %ecx, %xmm2 +; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $3, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $5, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $7, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $9, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $11, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $13, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2 +; AVX2-NEXT: vpextrb $15, %xmm0, %eax +; AVX2-NEXT: bsfl %eax, %eax +; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: retq + %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %in, i1 -1) + ret <32 x i8> %out +} + +define <4 x i64> @foldv4i64() { +; AVX-LABEL: foldv4i64: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX-NEXT: retq + %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0) + ret <4 x i64> %out +} + +define <4 x i64> @foldv4i64u() { +; AVX-LABEL: foldv4i64u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0] +; AVX-NEXT: retq + %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1) + ret <4 x i64> %out +} + +define <8 x i32> @foldv8i32() { +; AVX-LABEL: foldv8i32: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX-NEXT: retq + %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0) + ret <8 x i32> %out +} + +define <8 x i32> @foldv8i32u() { +; AVX-LABEL: foldv8i32u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3] +; AVX-NEXT: retq + %out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1) + ret <8 x i32> %out +} + +define <16 x i16> @foldv16i16() { +; AVX-LABEL: foldv16i16: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = 
[8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX-NEXT: retq + %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0) + ret <16 x i16> %out +} + +define <16 x i16> @foldv16i16u() { +; AVX-LABEL: foldv16i16u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5] +; AVX-NEXT: retq + %out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1) + ret <16 x i16> %out +} + +define <32 x i8> @foldv32i8() { +; AVX-LABEL: foldv32i8: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX-NEXT: retq + %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0) + ret <32 x i8> %out +} + +define <32 x i8> @foldv32i8u() { +; AVX-LABEL: foldv32i8u: +; AVX: # BB#0: +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0] +; AVX-NEXT: retq + %out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1) + ret <32 x i8> %out +} + +declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1) +declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>, i1) +declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1) +declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1) diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll index 42781830ff2f..c64e17442675 100644 --- a/test/CodeGen/X86/vector-zext.ll +++ b/test/CodeGen/X86/vector-zext.ll @@ -9,7 +9,6 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: # kill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pand .LCPI0_0(%rip), %xmm1 @@ -19,7 +18,6 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: # kill ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pand .LCPI0_0(%rip), %xmm1 @@ -156,7 +154,6 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) { ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: # kill ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: pand .LCPI3_0(%rip), %xmm1 @@ -166,7 +163,6 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) { ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor 
%xmm2, %xmm2 -; SSSE3-NEXT: # kill ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: pand .LCPI3_0(%rip), %xmm1 @@ -334,7 +330,6 @@ define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: # kill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE2-NEXT: retq @@ -343,7 +338,6 @@ define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: # kill ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSSE3-NEXT: retq @@ -366,7 +360,6 @@ define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ; ; AVX2-LABEL: shuf_zext_8i16_to_8i32: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: # kill ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: retq entry: @@ -380,7 +373,6 @@ define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: # kill ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: retq @@ -389,7 +381,6 @@ define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movdqa %xmm0, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: # kill ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSSE3-NEXT: retq @@ -413,7 +404,6 @@ define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ; ; AVX2-LABEL: shuf_zext_4i32_to_4i64: ; AVX2: # BB#0: # %entry -; AVX2-NEXT: # kill ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: retq entry: diff --git a/test/CodeGen/X86/win32-eh-states.ll b/test/CodeGen/X86/win32-eh-states.ll new file mode 100644 index 000000000000..8db127df6da7 --- /dev/null +++ b/test/CodeGen/X86/win32-eh-states.ll @@ -0,0 +1,112 @@ +; RUN: llc -mtriple=i686-pc-windows-msvc < %s | FileCheck %s + +; Based on this source: +; extern "C" void may_throw(int); +; void f() { +; try { +; may_throw(1); +; try { +; may_throw(2); +; } catch (int) { +; may_throw(3); +; } +; } catch (int) { +; may_throw(4); +; } +; } + +%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] } +%eh.CatchHandlerType = type { i32, i8* } + +declare void @may_throw(i32) +declare i32 @__CxxFrameHandler3(...) 
+declare void @llvm.eh.begincatch(i8*, i8*) +declare void @llvm.eh.endcatch() +declare i32 @llvm.eh.typeid.for(i8*) + +$"\01??_R0H@8" = comdat any + +@"\01??_7type_info@@6B@" = external constant i8* +@"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat +@llvm.eh.handlertype.H.0 = private unnamed_addr constant %eh.CatchHandlerType { i32 0, i8* bitcast (%rtti.TypeDescriptor2* @"\01??_R0H@8" to i8*) }, section "llvm.metadata" + +define void @f() #0 { +entry: + invoke void @may_throw(i32 1) + to label %invoke.cont unwind label %lpad + +invoke.cont: ; preds = %entry + invoke void @may_throw(i32 2) + to label %try.cont.9 unwind label %lpad.1 + +try.cont.9: ; preds = %invoke.cont.3, %invoke.cont, %catch.7 + ; FIXME: Something about our CFG breaks TailDuplication. This empy asm blocks + ; it so we can focus on testing the state numbering. + call void asm sideeffect "", "~{dirflag},~{fpsr},~{flags}"() + ret void + +lpad: ; preds = %catch, %entry + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) + catch %eh.CatchHandlerType* @llvm.eh.handlertype.H.0 + %1 = extractvalue { i8*, i32 } %0, 0 + %2 = extractvalue { i8*, i32 } %0, 1 + br label %catch.dispatch.4 + +lpad.1: ; preds = %invoke.cont + %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) + catch i8* bitcast (%eh.CatchHandlerType* @llvm.eh.handlertype.H.0 to i8*) + %4 = extractvalue { i8*, i32 } %3, 0 + %5 = extractvalue { i8*, i32 } %3, 1 + %6 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (%eh.CatchHandlerType* @llvm.eh.handlertype.H.0 to i8*)) #3 + %matches = icmp eq i32 %5, %6 + br i1 %matches, label %catch, label %catch.dispatch.4 + +catch.dispatch.4: ; preds = %lpad.1, %lpad + %exn.slot.0 = phi i8* [ %4, %lpad.1 ], [ %1, %lpad ] + %ehselector.slot.0 = phi i32 [ %5, %lpad.1 ], [ %2, %lpad ] + %.pre = tail call i32 @llvm.eh.typeid.for(i8* bitcast (%eh.CatchHandlerType* @llvm.eh.handlertype.H.0 to i8*)) #3 + %matches6 = icmp eq i32 %ehselector.slot.0, %.pre + br i1 %matches6, label %catch.7, label %eh.resume + +catch.7: ; preds = %catch.dispatch.4 + tail call void @llvm.eh.begincatch(i8* %exn.slot.0, i8* null) #3 + tail call void @may_throw(i32 4) + tail call void @llvm.eh.endcatch() #3 + br label %try.cont.9 + +catch: ; preds = %lpad.1 + tail call void @llvm.eh.begincatch(i8* %4, i8* null) #3 + invoke void @may_throw(i32 3) + to label %invoke.cont.3 unwind label %lpad + +invoke.cont.3: ; preds = %catch + tail call void @llvm.eh.endcatch() #3 + br label %try.cont.9 + +eh.resume: ; preds = %catch.dispatch.4 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0 + %lpad.val.12 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1 + resume { i8*, i32 } %lpad.val.12 +} + +; CHECK-LABEL: _f: +; CHECK: movl $-1, [[state:[-0-9]+]](%ebp) +; CHECK: movl $___ehhandler$f, {{.*}} +; +; CHECK: movl $0, [[state]](%ebp) +; CHECK: movl $1, (%esp) +; CHECK: calll _may_throw +; +; CHECK: movl $1, [[state]](%ebp) +; CHECK: movl $2, (%esp) +; CHECK: calll _may_throw + +; CHECK-LABEL: _f.catch: +; CHECK: movl $4, Lf$frame_escape_{{[0-9]+.*}} +; CHECK: movl $4, (%esp) +; CHECK: calll _may_throw + +; CHECK-LABEL: _f.catch.1: +; CHECK: movl $3, Lf$frame_escape_{{[0-9]+.*}} +; CHECK: movl $3, (%esp) +; CHECK: calll _may_throw diff --git a/test/CodeGen/X86/win32-eh.ll b/test/CodeGen/X86/win32-eh.ll index 4d3c34ed6c17..42c9d9e2240d 100644 --- a/test/CodeGen/X86/win32-eh.ll +++ 
b/test/CodeGen/X86/win32-eh.ll @@ -19,16 +19,18 @@ catchall: } ; CHECK-LABEL: _use_except_handler3: +; CHECK: pushl %ebp +; CHECK: movl %esp, %ebp ; CHECK: subl ${{[0-9]+}}, %esp -; CHECK: movl $-1, 12(%esp) -; CHECK: movl $L__ehtable$use_except_handler3, 8(%esp) -; CHECK: movl $__except_handler3, 4(%esp) +; CHECK: movl $-1, -4(%ebp) +; CHECK: movl $L__ehtable$use_except_handler3, -8(%ebp) +; CHECK: leal -16(%ebp), %[[node:[^ ,]*]] +; CHECK: movl $__except_handler3, -12(%ebp) ; CHECK: movl %fs:0, %[[next:[^ ,]*]] -; CHECK: movl %[[next]], (%esp) -; CHECK: leal (%esp), %[[node:[^ ,]*]] +; CHECK: movl %[[next]], -16(%ebp) ; CHECK: movl %[[node]], %fs:0 ; CHECK: calll _may_throw_or_crash -; CHECK: movl (%esp), %[[next:[^ ,]*]] +; CHECK: movl -16(%ebp), %[[next:[^ ,]*]] ; CHECK: movl %[[next]], %fs:0 ; CHECK: retl @@ -44,17 +46,21 @@ catchall: } ; CHECK-LABEL: _use_except_handler4: +; CHECK: pushl %ebp +; CHECK: movl %esp, %ebp ; CHECK: subl ${{[0-9]+}}, %esp -; CHECK: movl %esp, (%esp) -; CHECK: movl $-2, 20(%esp) -; CHECK: movl $L__ehtable$use_except_handler4, 4(%esp) -; CHECK: leal 8(%esp), %[[node:[^ ,]*]] -; CHECK: movl $__except_handler4, 12(%esp) +; CHECK: movl %esp, -24(%ebp) +; CHECK: movl $-2, -4(%ebp) +; CHECK: movl $L__ehtable$use_except_handler4, %[[lsda:[^ ,]*]] +; CHECK: xorl ___security_cookie, %[[lsda]] +; CHECK: movl %[[lsda]], -8(%ebp) +; CHECK: leal -16(%ebp), %[[node:[^ ,]*]] +; CHECK: movl $__except_handler4, -12(%ebp) ; CHECK: movl %fs:0, %[[next:[^ ,]*]] -; CHECK: movl %[[next]], 8(%esp) +; CHECK: movl %[[next]], -16(%ebp) ; CHECK: movl %[[node]], %fs:0 ; CHECK: calll _may_throw_or_crash -; CHECK: movl 8(%esp), %[[next:[^ ,]*]] +; CHECK: movl -16(%ebp), %[[next:[^ ,]*]] ; CHECK: movl %[[next]], %fs:0 ; CHECK: retl @@ -73,19 +79,34 @@ catchall: } ; CHECK-LABEL: _use_CxxFrameHandler3: +; CHECK: pushl %ebp +; CHECK: movl %esp, %ebp ; CHECK: subl ${{[0-9]+}}, %esp -; CHECK: movl %esp, (%esp) -; CHECK: movl $-1, 12(%esp) -; CHECK: leal 4(%esp), %[[node:[^ ,]*]] -; CHECK: movl $___ehhandler$use_CxxFrameHandler3, 8(%esp) +; CHECK: movl %esp, -16(%ebp) +; CHECK: movl $-1, -4(%ebp) +; CHECK: leal -12(%ebp), %[[node:[^ ,]*]] +; CHECK: movl $___ehhandler$use_CxxFrameHandler3, -8(%ebp) ; CHECK: movl %fs:0, %[[next:[^ ,]*]] -; CHECK: movl %[[next]], 4(%esp) +; CHECK: movl %[[next]], -12(%ebp) ; CHECK: movl %[[node]], %fs:0 +; CHECK: movl $0, -4(%ebp) ; CHECK: calll _may_throw_or_crash -; CHECK: movl 4(%esp), %[[next:[^ ,]*]] +; CHECK: movl -12(%ebp), %[[next:[^ ,]*]] ; CHECK: movl %[[next]], %fs:0 ; CHECK: retl +; CHECK: .section .xdata,"dr" +; CHECK-LABEL: L__ehtable$use_CxxFrameHandler3: +; CHECK-NEXT: .long 429065506 +; CHECK-NEXT: .long 2 +; CHECK-NEXT: .long ($stateUnwindMap$use_CxxFrameHandler3) +; CHECK-NEXT: .long 1 +; CHECK-NEXT: .long ($tryMap$use_CxxFrameHandler3) +; CHECK-NEXT: .long 0 +; CHECK-NEXT: .long 0 +; CHECK-NEXT: .long 0 +; CHECK-NEXT: .long 1 + ; CHECK-LABEL: ___ehhandler$use_CxxFrameHandler3: ; CHECK: movl $L__ehtable$use_CxxFrameHandler3, %eax ; CHECK: jmp ___CxxFrameHandler3 # TAILCALL |
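
The win32-eh.ll checks above describe the SEH frame the compiler now builds on i686-pc-windows-msvc: an EH registration node (saved %fs:0 link, handler address, state slot) is placed at fixed negative offsets from %ebp, linked into %fs:0 before the call to _may_throw_or_crash, and unlinked afterwards. For orientation only, a minimal MSVC-style C source of the kind these tests lower from might look like the sketch below; the function names are taken from the CHECK labels, the body is an assumption rather than the test's actual source, and whether __except_handler3 or __except_handler4 ends up as the personality depends on compiler options (such as /GS), not on this code.

    /* Illustrative sketch only -- not part of the patch above.
     * __try/__except with filter 1 (EXCEPTION_EXECUTE_HANDLER) gives a
     * catch-all region, which is what the "catchall:" block in the test IR
     * corresponds to. The compiler emits the fs:0 registration sequence
     * checked in the CHECK lines around the call. */
    void may_throw_or_crash(void);   /* assumed external, as in the test */

    void use_except_handler3(void) {
      __try {
        may_throw_or_crash();        /* calll _may_throw_or_crash */
      } __except (1) {               /* catch-all handler */
        ;
      }
    }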