author     Dimitry Andric <dim@FreeBSD.org>          2017-12-24 14:04:50 +0000
committer  Dimitry Andric <dim@FreeBSD.org>          2017-12-24 14:04:50 +0000
commit     fd4675b5a029cce616a1b0ad339344c5df800ea6 (patch)
tree       90692ad6d7101214860cca83c219d3c001515d02 /test/CodeGen/X86/shrink_vmul.ll
parent     c7dac04c3480f3c20487f912f77343139fce2d99 (diff)
Vendor import of llvm trunk r321426:vendor/llvm/llvm-trunk-r321426
Notes:
svn path=/vendor/llvm/dist/; revision=327152
svn path=/vendor/llvm/llvm-trunk-r321426/; revision=327153; tag=vendor/llvm/llvm-trunk-r321426
Diffstat (limited to 'test/CodeGen/X86/shrink_vmul.ll')
-rw-r--r--  test/CodeGen/X86/shrink_vmul.ll  2995
1 file changed, 2056 insertions, 939 deletions
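The diff below touches only autogenerated FileCheck assertions, so it is easier to audit by regenerating them than by reading line by line. As a minimal sketch of that workflow (the build path is illustrative; the script is the one named in the test's own NOTE line, and --llc-binary is assumed to point at a locally built llc):

    $ python utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
        test/CodeGen/X86/shrink_vmul.ll

The script executes each RUN line and rewrites the per-prefix CHECK blocks, which is why adding the three AVX/AVX2 RUN lines regenerates the checks for the entire file.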
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index 5700b1df15bd..a516c709517d 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -1,6 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2

 @c = external global i32*, align 8

@@ -11,42 +15,69 @@
 ; %rst = mul <2 x i32> %op1, %op2
 ;
 define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_2xi8:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movzwl (%edx,%ecx), %edx
-; X86-NEXT: movd %edx, %xmm0
-; X86-NEXT: movzwl (%eax,%ecx), %eax
-; X86-NEXT: movd %eax, %xmm1
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi8:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-NEXT: movd %ecx, %xmm1
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-NEXT: movq %xmm1, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi8:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
+; X86-SSE-NEXT: movd %edx, %xmm0
+; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
+; X86-SSE-NEXT: movd %eax, %xmm1
+; X86-SSE-NEXT: pxor %xmm2, %xmm2
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi8:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: .cfi_offset %esi, -8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: movl c, %esi
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi8:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm1
+; X64-SSE-NEXT: pxor %xmm2, %xmm2
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi8:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT: retq
 entry:
   %pre = load i32*, i32** @c
   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -71,38 +102,63 @@ entry:
 ; %rst = mul <4 x i32> %op1, %op2
 ;
 define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_4xi8:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_4xi8:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-NEXT: movdqu %xmm1, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_4xi8:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: pxor %xmm2, %xmm2
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_4xi8:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: .cfi_offset %esi, -8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: movl c, %esi
+; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_4xi8:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: pxor %xmm2, %xmm2
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_4xi8:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT: retq
 entry:
   %pre = load i32*, i32** @c
   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -127,44 +183,106 @@ entry:
 ; %rst = mul <8 x i32> %op1, %op2
 ;
 define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_8xi8:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: movdqa %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_8xi8:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_8xi8:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT: pxor %xmm2, %xmm2
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: mul_8xi8:
+; X86-AVX1: # %bb.0: # %entry
+; X86-AVX1-NEXT: pushl %esi
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %esi, -8
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT: movl c, %esi
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4)
+; X86-AVX1-NEXT: popl %esi
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: mul_8xi8:
+; X86-AVX2: # %bb.0: # %entry
+; X86-AVX2-NEXT: pushl %esi
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %esi, -8
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT: movl c, %esi
+; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
+; X86-AVX2-NEXT: popl %esi
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: mul_8xi8:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE-NEXT: pxor %xmm2, %xmm2
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: mul_8xi8:
+; X64-AVX1: # %bb.0: # %entry
+; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4)
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_8xi8:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
 entry:
   %pre = load i32*, i32** @c
   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -189,64 +307,150 @@ entry:
 ; %rst = mul <16 x i32> %op1, %op2
 ;
 define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_16xi8:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movdqu (%edx,%ecx), %xmm0
-; X86-NEXT: movdqu (%eax,%ecx), %xmm1
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: movdqa %xmm0, %xmm3
-; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X86-NEXT: movdqa %xmm1, %xmm4
-; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X86-NEXT: pmullw %xmm3, %xmm4
-; X86-NEXT: movdqa %xmm4, %xmm3
-; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X86-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; X86-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: movdqa %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm0, 32(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm4, 16(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm3, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_16xi8:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movdqu (%rdi,%rdx), %xmm0
-; X64-NEXT: movdqu (%rsi,%rdx), %xmm1
-; X64-NEXT: pxor %xmm2, %xmm2
-; X64-NEXT: movdqa %xmm0, %xmm3
-; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-NEXT: movdqa %xmm1, %xmm4
-; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X64-NEXT: pmullw %xmm3, %xmm4
-; X64-NEXT: movdqa %xmm4, %xmm3
-; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; X64-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm3, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_16xi8:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
+; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
+; X86-SSE-NEXT: pxor %xmm2, %xmm2
+; X86-SSE-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X86-SSE-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X86-SSE-NEXT: pmullw %xmm3, %xmm4
+; X86-SSE-NEXT: movdqa %xmm4, %xmm3
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: mul_16xi8:
+; X86-AVX1: # %bb.0: # %entry
+; X86-AVX1-NEXT: pushl %esi
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %esi, -8
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT: movl c, %esi
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4)
+; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4)
+; X86-AVX1-NEXT: popl %esi
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: mul_16xi8:
+; X86-AVX2: # %bb.0: # %entry
+; X86-AVX2-NEXT: pushl %esi
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %esi, -8
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT: movl c, %esi
+; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
+; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
+; X86-AVX2-NEXT: popl %esi
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: mul_16xi8:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
+; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
+; X64-SSE-NEXT: pxor %xmm2, %xmm2
+; X64-SSE-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X64-SSE-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X64-SSE-NEXT: pmullw %xmm3, %xmm4
+; X64-SSE-NEXT: movdqa %xmm4, %xmm3
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: mul_16xi8:
+; X64-AVX1: # %bb.0: # %entry
+; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4)
+; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4)
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_16xi8:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
+; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
 entry:
   %pre = load i32*, i32** @c
   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -271,36 +475,65 @@ entry:
 ; %rst = mul <2 x i32> %op1, %op2
 ;
 define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_2xi16:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: movdqa %xmm1, %xmm2
-; X86-NEXT: pmulhuw %xmm0, %xmm2
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi16:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: pmulhuw %xmm0, %xmm2
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-NEXT: movq %xmm1, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi16:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi16:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: .cfi_offset %esi, -8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: movl c, %esi
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi16:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi16:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT: retq
 entry:
   %pre = load i32*, i32** @c
   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -325,36 +558,61 @@ entry:
 ; %rst = mul <4 x i32> %op1, %op2
 ;
 define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_4xi16:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT: movdqa %xmm1, %xmm2
-; X86-NEXT: pmulhuw %xmm0, %xmm2
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_4xi16:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: pmulhuw %xmm0, %xmm2
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-NEXT: movdqu %xmm1, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_4xi16:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_4xi16:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: .cfi_offset %esi, -8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: movl c, %esi
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_4xi16:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_4xi16:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT: retq
 entry:
   %pre = load i32*, i32** @c
   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -379,42 +637,104 @@ entry:
 ; %rst = mul <8 x i32> %op1, %op2
 ;
 define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_8xi16:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movdqu (%edx,%ecx), %xmm0
-; X86-NEXT: movdqu (%eax,%ecx), %xmm1
-; X86-NEXT: movdqa %xmm1, %xmm2
-; X86-NEXT: pmulhuw %xmm0, %xmm2
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: movdqa %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_8xi16:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movdqu (%rdi,%rdx), %xmm0
-; X64-NEXT: movdqu (%rsi,%rdx), %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm2
-; X64-NEXT: pmulhuw %xmm0, %xmm2
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: movdqa %xmm1, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_8xi16:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
+; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
+; X86-SSE-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: mul_8xi16:
+; X86-AVX1: # %bb.0: # %entry
+; X86-AVX1-NEXT: pushl %esi
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %esi, -8
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT: movl c, %esi
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4)
+; X86-AVX1-NEXT: popl %esi
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: mul_8xi16:
+; X86-AVX2: # %bb.0: # %entry
+; X86-AVX2-NEXT: pushl %esi
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %esi, -8
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT: movl c, %esi
+; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
+; X86-AVX2-NEXT: popl %esi
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: mul_8xi16:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
+; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
+; X64-SSE-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: mul_8xi16:
+; X64-AVX1: # %bb.0: # %entry
+; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4)
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_8xi16:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
 entry:
   %pre = load i32*, i32** @c
   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -439,62 +759,148 @@ entry:
 ; %rst = mul <16 x i32> %op1, %op2
 ;
 define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_16xi16:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movdqu (%edx,%ecx), %xmm0
-; X86-NEXT: movdqu 16(%edx,%ecx), %xmm1
-; X86-NEXT: movdqu (%eax,%ecx), %xmm2
-; X86-NEXT: movdqu 16(%eax,%ecx), %xmm3
-; X86-NEXT: movdqa %xmm2, %xmm4
-; X86-NEXT: pmulhuw %xmm0, %xmm4
-; X86-NEXT: pmullw %xmm0, %xmm2
-; X86-NEXT: movdqa %xmm2, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X86-NEXT: movdqa %xmm3, %xmm4
-; X86-NEXT: pmulhuw %xmm1, %xmm4
-; X86-NEXT: pmullw %xmm1, %xmm3
-; X86-NEXT: movdqa %xmm3, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; X86-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; X86-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
-; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_16xi16:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movdqu (%rdi,%rdx), %xmm0
-; X64-NEXT: movdqu 16(%rdi,%rdx), %xmm1
-; X64-NEXT: movdqu (%rsi,%rdx), %xmm2
-; X64-NEXT: movdqu 16(%rsi,%rdx), %xmm3
-; X64-NEXT: movdqa %xmm2, %xmm4
-; X64-NEXT: pmulhuw %xmm0, %xmm4
-; X64-NEXT: pmullw %xmm0, %xmm2
-; X64-NEXT: movdqa %xmm2, %xmm0
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X64-NEXT: movdqa %xmm3, %xmm4
-; X64-NEXT: pmulhuw %xmm1, %xmm4
-; X64-NEXT: pmullw %xmm1, %xmm3
-; X64-NEXT: movdqa %xmm3, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; X64-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
-; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_16xi16:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
+; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1
+; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2
+; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3
+; X86-SSE-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4
+; X86-SSE-NEXT: pmullw %xmm0, %xmm2
+; X86-SSE-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X86-SSE-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4
+; X86-SSE-NEXT: pmullw %xmm1, %xmm3
+; X86-SSE-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
+; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX1-LABEL: mul_16xi16:
+; X86-AVX1: # %bb.0: # %entry
+; X86-AVX1-NEXT: pushl %esi
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %esi, -8
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT: movl c, %esi
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4)
+; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4)
+; X86-AVX1-NEXT: popl %esi
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: mul_16xi16:
+; X86-AVX2: # %bb.0: # %entry
+; X86-AVX2-NEXT: pushl %esi
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %esi, -8
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX2-NEXT: movl c, %esi
+; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
+; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
+; X86-AVX2-NEXT: popl %esi
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE-LABEL: mul_16xi16:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
+; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
+; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
+; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
+; X64-SSE-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4
+; X64-SSE-NEXT: pmullw %xmm0, %xmm2
+; X64-SSE-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X64-SSE-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4
+; X64-SSE-NEXT: pmullw %xmm1, %xmm3
+; X64-SSE-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
+; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX1-LABEL: mul_16xi16:
+; X64-AVX1: # %bb.0: # %entry
+; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4)
+; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4)
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_16xi16:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
+; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
 entry:
   %pre = load i32*, i32** @c
   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -519,46 +925,73 @@ entry:
 ; %rst = mul <2 x i32> %op1, %op2
 ;
 define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_2xi8_sext:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movzwl (%edx,%ecx), %edx
-; X86-NEXT: movd %edx, %xmm0
-; X86-NEXT: movzwl (%eax,%ecx), %eax
-; X86-NEXT: movd %eax, %xmm1
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT: psraw $8, %xmm0
-; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT: psraw $8, %xmm1
-; X86-NEXT: pmullw %xmm0, %xmm1
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-NEXT: psrad $16, %xmm0
-; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-; X64-LABEL: mul_2xi8_sext:
-; X64: # %bb.0: # %entry
-; X64-NEXT: movq {{.*}}(%rip), %rax
-; X64-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-NEXT: movd %ecx, %xmm0
-; X64-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-NEXT: movd %ecx, %xmm1
-; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: psraw $8, %xmm0
-; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT: psraw $8, %xmm1
-; X64-NEXT: pmullw %xmm0, %xmm1
-; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-NEXT: psrad $16, %xmm0
-; X64-NEXT: movq %xmm0, (%rax,%rdx,4)
-; X64-NEXT: retq
+; X86-SSE-LABEL: mul_2xi8_sext:
+; X86-SSE: # %bb.0: # %entry
+; X86-SSE-NEXT: pushl %esi
+; X86-SSE-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE-NEXT: .cfi_offset %esi, -8
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT: movl c, %esi
+; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
+; X86-SSE-NEXT: movd %edx, %xmm0
+; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
+; X86-SSE-NEXT: movd %eax, %xmm1
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT: psraw $8, %xmm0
+; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE-NEXT: psraw $8, %xmm1
+; X86-SSE-NEXT: pmullw %xmm0, %xmm1
+; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-SSE-NEXT: psrad $16, %xmm0
+; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
+; X86-SSE-NEXT: popl %esi
+; X86-SSE-NEXT: retl
+;
+; X86-AVX-LABEL: mul_2xi8_sext:
+; X86-AVX: # %bb.0: # %entry
+; X86-AVX-NEXT: pushl %esi
+; X86-AVX-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX-NEXT: .cfi_offset %esi, -8
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX-NEXT: movl c, %esi
+; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0
+; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1
+; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
+; X86-AVX-NEXT: popl %esi
+; X86-AVX-NEXT: retl
+;
+; X64-SSE-LABEL: mul_2xi8_sext:
+; X64-SSE: # %bb.0: # %entry
+; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
+; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm0
+; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
+; X64-SSE-NEXT: movd %ecx, %xmm1
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT: psraw $8, %xmm0
+; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-SSE-NEXT: psraw $8, %xmm1
+; X64-SSE-NEXT: pmullw %xmm0, %xmm1
+; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-SSE-NEXT: psrad $16, %xmm0
+; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: mul_2xi8_sext:
+; X64-AVX: # %bb.0: # %entry
+; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
+; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0
+; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1
+; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
+; X64-AVX-NEXT: retq
 entry:
   %pre = load i32*, i32** @c
   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -583,48 +1016,75 @@ entry:
 ; %rst = mul <2 x i32> %op1, %op2
 ;
 define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; X86-LABEL: mul_2xi8_sext_zext:
-; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl c, %esi
-; X86-NEXT: movzwl (%edx,%ecx), %edx
-; X86-NEXT: movd %edx, %xmm0
-; X86-NEXT: movzwl (%eax,%ecx), %eax
-; X86-NEXT: movd %eax, %xmm1
-; X86-NEXT: pxor %xmm2, %xmm2
-; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-NEXT: psraw $8, %xmm0
-; X86-NEXT: movdqa %xmm1, %xmm2
-; X86-NEXT: pmulhw %xmm0, %xmm2
-; X86-NEXT: pmullw %xmm1, %xmm0
-; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-;
-;
X64-LABEL: mul_2xi8_sext_zext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-NEXT: movd %ecx, %xmm1 -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm0 -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pmulhw %xmm0, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_2xi8_sext_zext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax +; X86-SSE-NEXT: movd %eax, %xmm1 +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_sext_zext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_sext_zext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm1 +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_sext_zext: +; X64-AVX: # %bb.0: # 
%entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -649,36 +1109,63 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi16_sext: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: pmulhw %xmm0, %xmm2 -; X86-NEXT: pmullw %xmm0, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: movq %xmm1, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl -; -; X64-LABEL: mul_2xi16_sext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pmulhw %xmm0, %xmm2 -; X64-NEXT: pmullw %xmm0, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: movq %xmm1, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_2xi16_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X86-SSE-NEXT: pmullw %xmm0, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_sext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1 +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 +; X64-SSE-NEXT: pmullw %xmm0, %xmm1 +; X64-SSE-NEXT: punpcklwd 
{{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_sext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1 +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -703,62 +1190,93 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_2xi16_sext_zext: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-NEXT: pxor %xmm2, %xmm2 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X86-NEXT: movdqa %xmm1, %xmm2 -; X86-NEXT: psrlq $32, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm2 -; X86-NEXT: movdqa %xmm0, %xmm3 -; X86-NEXT: psrlq $32, %xmm3 -; X86-NEXT: pmuludq %xmm1, %xmm3 -; X86-NEXT: paddq %xmm2, %xmm3 -; X86-NEXT: psllq $32, %xmm3 -; X86-NEXT: pmuludq %xmm0, %xmm1 -; X86-NEXT: paddq %xmm3, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X86-NEXT: movq %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl -; -; X64-LABEL: mul_2xi16_sext_zext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: psrlq $32, %xmm2 -; X64-NEXT: pmuludq %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: psrlq $32, %xmm3 -; X64-NEXT: pmuludq %xmm1, %xmm3 -; X64-NEXT: paddq %xmm2, %xmm3 -; X64-NEXT: psllq $32, %xmm3 -; X64-NEXT: pmuludq %xmm0, %xmm1 -; X64-NEXT: paddq %xmm3, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_2xi16_sext_zext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor 
%xmm2, %xmm2 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; X86-SSE-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE-NEXT: psrlq $32, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X86-SSE-NEXT: paddq %xmm2, %xmm3 +; X86-SSE-NEXT: psllq $32, %xmm3 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X86-SSE-NEXT: paddq %xmm3, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_sext_zext: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: pushl %esi +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %esi, -8 +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX-NEXT: movl c, %esi +; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 +; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) +; X86-AVX-NEXT: popl %esi +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_sext_zext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] +; X64-SSE-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE-NEXT: psrlq $32, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X64-SSE-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE-NEXT: psrlq $32, %xmm3 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 +; X64-SSE-NEXT: paddq %xmm2, %xmm3 +; X64-SSE-NEXT: psllq $32, %xmm3 +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: paddq %xmm3, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_sext_zext: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 +; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -783,62 +1301,148 @@ entry: ; %rst = mul <16 x i32> %op1, %op2 ; define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { -; X86-LABEL: mul_16xi16_sext: -; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl c, %esi -; X86-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-NEXT: movdqu 16(%edx,%ecx), %xmm1 -; X86-NEXT: movdqu (%eax,%ecx), %xmm2 -; X86-NEXT: movdqu 16(%eax,%ecx), %xmm3 -; X86-NEXT: movdqa %xmm2, %xmm4 -; X86-NEXT: pmulhw %xmm0, %xmm4 -; X86-NEXT: pmullw %xmm0, %xmm2 -; X86-NEXT: movdqa %xmm2, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X86-NEXT: movdqa %xmm3, %xmm4 -; X86-NEXT: pmulhw %xmm1, %xmm4 -; X86-NEXT: pmullw %xmm1, %xmm3 -; X86-NEXT: movdqa %xmm3, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) -; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4) -; X86-NEXT: popl %esi -; X86-NEXT: retl -; -; X64-LABEL: mul_16xi16_sext: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movdqu (%rdi,%rdx), %xmm0 -; X64-NEXT: movdqu 16(%rdi,%rdx), %xmm1 -; X64-NEXT: movdqu (%rsi,%rdx), %xmm2 -; X64-NEXT: movdqu 16(%rsi,%rdx), %xmm3 -; X64-NEXT: movdqa %xmm2, %xmm4 -; X64-NEXT: pmulhw %xmm0, %xmm4 -; X64-NEXT: pmullw %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; X64-NEXT: movdqa %xmm3, %xmm4 -; X64-NEXT: pmulhw %xmm1, %xmm4 -; X64-NEXT: pmullw %xmm1, %xmm3 -; X64-NEXT: movdqa %xmm3, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_16xi16_sext: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %esi, -8 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl c, %esi +; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 +; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 +; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 +; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 +; X86-SSE-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm0, %xmm4 +; X86-SSE-NEXT: pmullw %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: movdqa %xmm3, %xmm4 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm4 +; X86-SSE-NEXT: pmullw %xmm1, %xmm3 +; X86-SSE-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) 
+; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: mul_16xi16_sext: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: .cfi_offset %esi, -8 +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX1-NEXT: movl c, %esi +; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm1 +; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm2 +; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm3 +; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4 +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) +; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: mul_16xi16_sext: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-AVX2-NEXT: movl c, %esi +; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0 +; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1 +; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2 +; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2 +; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) +; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: mul_16xi16_sext: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 +; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 +; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 +; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 +; X64-SSE-NEXT: movdqa %xmm2, %xmm4 +; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 +; X64-SSE-NEXT: pmullw %xmm0, %xmm2 +; X64-SSE-NEXT: movdqa %xmm2, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: movdqa %xmm3, %xmm4 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 +; X64-SSE-NEXT: pmullw %xmm1, %xmm3 +; X64-SSE-NEXT: movdqa %xmm3, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: mul_16xi16_sext: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX1-NEXT: vpmovsxwd 
16(%rdi,%rdx), %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm1 +; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm2 +; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm3 +; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 +; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 +; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 +; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4 +; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) +; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: mul_16xi16_sext: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0 +; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1 +; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2 +; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2 +; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) +; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -862,31 +1466,54 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst1: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl -; -; X64-LABEL: mul_2xi8_varconst1: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: pmullw {{.*}}(%rip), %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_2xi8_varconst1: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst1: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst1: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst1: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: movl $255, %ecx +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -906,33 +1533,53 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst2: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm0 -; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl -; -; X64-LABEL: mul_2xi8_varconst2: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm0 -; X64-NEXT: pmullw {{.*}}(%rip), %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_2xi8_varconst2: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst2: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; 
X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst2: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst2: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -952,37 +1599,60 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst3: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl -; -; X64-LABEL: mul_2xi8_varconst3: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_2xi8_varconst3: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: 
retl +; +; X86-AVX-LABEL: mul_2xi8_varconst3: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst3: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst3: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: movl $256, %ecx # imm = 0x100 +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1002,37 +1672,57 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst4: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl -; -; X64-LABEL: mul_2xi8_varconst4: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_2xi8_varconst4: +; X86-SSE: # %bb.0: # 
%entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst4: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst4: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst4: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1052,37 +1742,57 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst5: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm0 -; X86-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl -; -; X64-LABEL: mul_2xi8_varconst5: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw 
$8, %xmm0 -; X64-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_2xi8_varconst5: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst5: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst5: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst5: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1102,37 +1812,57 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi8_varconst6: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movzwl (%ecx,%eax), %ecx -; X86-NEXT: movd %ecx, %xmm0 -; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-NEXT: psraw $8, %xmm0 -; X86-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl -; -; X64-LABEL: mul_2xi8_varconst6: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-NEXT: psraw $8, %xmm0 -; X64-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_2xi8_varconst6: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE-NEXT: psraw $8, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi8_varconst6: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxbq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi8_varconst6: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx +; X64-SSE-NEXT: movd %ecx, %xmm0 +; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE-NEXT: psraw $8, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi8_varconst6: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxbq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1152,31 +1882,58 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi16_varconst1: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhuw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl -; -; X64-LABEL: mul_2xi16_varconst1: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhuw %xmm1, %xmm2 
-; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_2xi16_varconst1: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_varconst1: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst1: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst1: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-AVX-NEXT: movl $65535, %ecx # imm = 0xFFFF +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1196,31 +1953,51 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi16_varconst2: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmulhw %xmm1, %xmm2 -; X86-NEXT: pmullw %xmm1, %xmm0 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl -; -; X64-LABEL: mul_2xi16_varconst2: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: movdqa {{.*#+}} xmm1 
= <32768,32767,u,u,u,u,u,u> -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmulhw %xmm1, %xmm2 -; X64-NEXT: pmullw %xmm1, %xmm0 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_2xi16_varconst2: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X86-SSE-NEXT: pmullw %xmm1, %xmm0 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_varconst2: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst2: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 +; X64-SSE-NEXT: pmullw %xmm1, %xmm0 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst2: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1240,45 +2017,72 @@ entry: ; %rst = mul <2 x i32> %op1, %op2 ; define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi16_varconst3: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pxor %xmm1, %xmm1 -; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: psrlq $32, %xmm0 -; X86-NEXT: pmuludq %xmm1, %xmm0 -; X86-NEXT: psllq $32, %xmm0 -; X86-NEXT: paddq %xmm2, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl -; -; X64-LABEL: mul_2xi16_varconst3: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: pxor %xmm1, %xmm1 -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-NEXT: movl $65536, %ecx # imm = 0x10000 -; X64-NEXT: movq %rcx, 
%xmm1 -; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: psrlq $32, %xmm0 -; X64-NEXT: pmuludq %xmm1, %xmm0 -; X64-NEXT: psllq $32, %xmm0 -; X64-NEXT: paddq %xmm2, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_2xi16_varconst3: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pxor %xmm1, %xmm1 +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm0 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: psllq $32, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_varconst3: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst3: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pxor %xmm1, %xmm1 +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000 +; X64-SSE-NEXT: movq %rcx, %xmm1 +; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: psrlq $32, %xmm0 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst3: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; X64-AVX-NEXT: movl $65536, %ecx # imm = 0x10000 +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1298,45 +2102,68 @@ entry: ; %rst = mul <2 x 
i32> %op1, %op2 ; define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { -; X86-LABEL: mul_2xi16_varconst4: -; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl c, %edx -; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X86-NEXT: psrad $16, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0] -; X86-NEXT: movdqa %xmm0, %xmm2 -; X86-NEXT: pmuludq %xmm1, %xmm2 -; X86-NEXT: psrlq $32, %xmm0 -; X86-NEXT: pmuludq %xmm1, %xmm0 -; X86-NEXT: psllq $32, %xmm0 -; X86-NEXT: paddq %xmm2, %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X86-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-NEXT: retl -; -; X64-LABEL: mul_2xi16_varconst4: -; X64: # %bb.0: # %entry -; X64-NEXT: movq {{.*}}(%rip), %rax -; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X64-NEXT: psrad $16, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X64-NEXT: movl $32768, %ecx # imm = 0x8000 -; X64-NEXT: movq %rcx, %xmm1 -; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: pmuludq %xmm1, %xmm2 -; X64-NEXT: psrlq $32, %xmm0 -; X64-NEXT: pmuludq %xmm1, %xmm0 -; X64-NEXT: psllq $32, %xmm0 -; X64-NEXT: paddq %xmm2, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-NEXT: retq +; X86-SSE-LABEL: mul_2xi16_varconst4: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl c, %edx +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X86-SSE-NEXT: psrad $16, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0] +; X86-SSE-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: psrlq $32, %xmm0 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X86-SSE-NEXT: psllq $32, %xmm0 +; X86-SSE-NEXT: paddq %xmm2, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: mul_2xi16_varconst4: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movl c, %edx +; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 +; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) +; X86-AVX-NEXT: retl +; +; X64-SSE-LABEL: mul_2xi16_varconst4: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: movq {{.*}}(%rip), %rax +; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X64-SSE-NEXT: psrad $16, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 +; X64-SSE-NEXT: movq %rcx, %xmm1 +; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: psrlq $32, %xmm0 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 +; X64-SSE-NEXT: psllq $32, %xmm0 +; X64-SSE-NEXT: paddq %xmm2, %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,2,2,3] +; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: mul_2xi16_varconst4: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: movq {{.*}}(%rip), %rax +; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 +; X64-AVX-NEXT: movl $32768, %ecx # imm = 0x8000 +; X64-AVX-NEXT: vmovq %rcx, %xmm1 +; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) +; X64-AVX-NEXT: retq entry: %pre = load i32*, i32** @c %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index @@ -1355,99 +2182,389 @@ entry: ; define void @PR34947() { -; X86-LABEL: PR34947: -; X86: # %bb.0: -; X86-NEXT: movdqa (%eax), %xmm0 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; X86-NEXT: movd %xmm1, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ecx -; X86-NEXT: movd %edx, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; X86-NEXT: movd %xmm2, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ecx -; X86-NEXT: movd %edx, %xmm2 -; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ecx -; X86-NEXT: movd %edx, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-NEXT: movd %xmm0, %ecx -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl %ecx -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: divl (%eax) -; X86-NEXT: movd %edx, %xmm0 -; X86-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] -; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X86-NEXT: pmuludq %xmm2, %xmm1 -; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X86-NEXT: pmuludq %xmm2, %xmm3 -; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X86-NEXT: movl $8199, %eax # imm = 0x2007 -; X86-NEXT: movd %eax, %xmm2 -; X86-NEXT: pmuludq %xmm0, %xmm2 -; X86-NEXT: movd %xmm2, (%eax) -; X86-NEXT: movdqa %xmm1, (%eax) -; X86-NEXT: retl -; -; X64-LABEL: PR34947: -; X64: # %bb.0: -; X64-NEXT: movdqa (%rax), %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; X64-NEXT: movd %xmm1, %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ecx -; X64-NEXT: movd %edx, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; X64-NEXT: movd %xmm2, %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ecx -; X64-NEXT: movd %edx, %xmm2 -; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ecx -; X64-NEXT: movd %edx, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X64-NEXT: movd %xmm0, %ecx -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl %ecx -; X64-NEXT: movd %edx, %xmm0 -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: divl (%rax) -; X64-NEXT: movd %edx, %xmm0 -; X64-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] -; X64-NEXT: 
pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X64-NEXT: pmuludq %xmm2, %xmm1 -; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X64-NEXT: pmuludq %xmm2, %xmm3 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X64-NEXT: movl $8199, %eax # imm = 0x2007 -; X64-NEXT: movd %eax, %xmm2 -; X64-NEXT: pmuludq %xmm0, %xmm2 -; X64-NEXT: movd %xmm2, (%rax) -; X64-NEXT: movdqa %xmm1, (%rax) -; X64-NEXT: retq +; X86-SSE-LABEL: PR34947: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movdqa (%eax), %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; X86-SSE-NEXT: movd %xmm1, %ecx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: movd %edx, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE-NEXT: movd %xmm2, %ecx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: movd %edx, %xmm2 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X86-SSE-NEXT: movd %xmm0, %ecx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: movd %edx, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE-NEXT: movd %xmm0, %ecx +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl %ecx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X86-SSE-NEXT: xorl %eax, %eax +; X86-SSE-NEXT: xorl %edx, %edx +; X86-SSE-NEXT: divl (%eax) +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE-NEXT: movl $8199, %eax # imm = 0x2007 +; X86-SSE-NEXT: movd %eax, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X86-SSE-NEXT: movd %xmm2, (%eax) +; X86-SSE-NEXT: movdqa %xmm1, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX1-LABEL: PR34947: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX1-NEXT: pushl %ebx +; X86-AVX1-NEXT: .cfi_def_cfa_offset 12 +; X86-AVX1-NEXT: pushl %edi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 16 +; X86-AVX1-NEXT: pushl %esi +; X86-AVX1-NEXT: .cfi_def_cfa_offset 20 +; X86-AVX1-NEXT: subl $16, %esp +; X86-AVX1-NEXT: .cfi_def_cfa_offset 36 +; X86-AVX1-NEXT: .cfi_offset %esi, -20 +; X86-AVX1-NEXT: .cfi_offset %edi, -16 +; X86-AVX1-NEXT: .cfi_offset %ebx, -12 +; X86-AVX1-NEXT: .cfi_offset %ebp, -8 +; X86-AVX1-NEXT: vmovdqa (%eax), %ymm0 +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl (%eax) +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl 
%edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-AVX1-NEXT: vmovd %xmm0, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, %ebp +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; X86-AVX1-NEXT: divl %ecx +; X86-AVX1-NEXT: movl %edx, %ecx +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vpextrd $2, %xmm0, %esi +; X86-AVX1-NEXT: divl %esi +; X86-AVX1-NEXT: movl %edx, %esi +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edi +; X86-AVX1-NEXT: divl %edi +; X86-AVX1-NEXT: movl %edx, %edi +; X86-AVX1-NEXT: xorl %eax, %eax +; X86-AVX1-NEXT: xorl %edx, %edx +; X86-AVX1-NEXT: vmovd %xmm0, %ebx +; X86-AVX1-NEXT: divl %ebx +; X86-AVX1-NEXT: vmovd %edx, %xmm0 +; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovd %ebp, %xmm1 +; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload +; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm2 # 4-byte Folded Reload +; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 +; X86-AVX1-NEXT: vmovd %eax, %xmm3 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199] +; X86-AVX1-NEXT: vpmulld %xmm4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmulld %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm1 +; X86-AVX1-NEXT: vmovd %xmm1, (%eax) +; X86-AVX1-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX1-NEXT: addl $16, %esp +; X86-AVX1-NEXT: popl %esi +; X86-AVX1-NEXT: popl %edi +; X86-AVX1-NEXT: popl %ebx +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: vzeroupper +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: PR34947: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX2-NEXT: .cfi_offset %esi, -8 +; X86-AVX2-NEXT: vmovdqa (%eax), %ymm0 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vpextrd $1, %xmm1, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: movl %edx, %ecx +; X86-AVX2-NEXT: vmovd %xmm1, %esi +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %esi +; X86-AVX2-NEXT: vmovd %edx, %xmm2 +; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpextrd $3, %xmm1, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm1 +; X86-AVX2-NEXT: vpextrd $1, %xmm0, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: movl %edx, %ecx +; X86-AVX2-NEXT: vmovd %xmm0, %esi +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %esi +; X86-AVX2-NEXT: vmovd %edx, %xmm2 +; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; 
X86-AVX2-NEXT: vpextrd $2, %xmm0, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl %ecx +; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0 +; X86-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: xorl %eax, %eax +; X86-AVX2-NEXT: xorl %edx, %edx +; X86-AVX2-NEXT: divl (%eax) +; X86-AVX2-NEXT: vmovd %edx, %xmm1 +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199] +; X86-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: movl $8199, %eax # imm = 0x2007 +; X86-AVX2-NEXT: vmovd %eax, %xmm2 +; X86-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; X86-AVX2-NEXT: vmovd %xmm1, (%eax) +; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: vzeroupper +; X86-AVX2-NEXT: retl +; +; X64-SSE-LABEL: PR34947: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movdqa (%rax), %xmm0 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] +; X64-SSE-NEXT: movd %xmm1, %ecx +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X64-SSE-NEXT: movd %xmm2, %ecx +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm2 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-SSE-NEXT: movd %xmm0, %ecx +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X64-SSE-NEXT: movd %xmm0, %ecx +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl %ecx +; X64-SSE-NEXT: movd %edx, %xmm0 +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X64-SSE-NEXT: xorl %eax, %eax +; X64-SSE-NEXT: xorl %edx, %edx +; X64-SSE-NEXT: divl (%rax) +; X64-SSE-NEXT: movd %edx, %xmm0 +; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; X64-SSE-NEXT: pmuludq %xmm2, %xmm3 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-SSE-NEXT: movl $8199, %eax # imm = 0x2007 +; X64-SSE-NEXT: movd %eax, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X64-SSE-NEXT: movd %xmm2, (%rax) +; X64-SSE-NEXT: movdqa %xmm1, (%rax) +; X64-SSE-NEXT: retq +; +; X64-AVX1-LABEL: PR34947: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: pushq %rbp +; X64-AVX1-NEXT: .cfi_def_cfa_offset 16 +; X64-AVX1-NEXT: pushq %rbx +; X64-AVX1-NEXT: .cfi_def_cfa_offset 24 +; X64-AVX1-NEXT: .cfi_offset %rbx, -24 +; X64-AVX1-NEXT: .cfi_offset %rbp, -16 +; X64-AVX1-NEXT: vmovdqa (%rax), %ymm0 +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl (%rax) +; X64-AVX1-NEXT: movl %edx, %r8d +; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %r9d +; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: 
divl %ecx +; X64-AVX1-NEXT: movl %edx, %r10d +; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %r11d +; X64-AVX1-NEXT: vmovd %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %esi +; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %edi +; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ecx +; X64-AVX1-NEXT: movl %edx, %ecx +; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ebx +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ebx +; X64-AVX1-NEXT: movl %edx, %ebx +; X64-AVX1-NEXT: vmovd %xmm0, %ebp +; X64-AVX1-NEXT: xorl %eax, %eax +; X64-AVX1-NEXT: xorl %edx, %edx +; X64-AVX1-NEXT: divl %ebp +; X64-AVX1-NEXT: vmovd %edx, %xmm0 +; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovd %esi, %xmm2 +; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-AVX1-NEXT: vmovd %r8d, %xmm1 +; X64-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 +; X64-AVX1-NEXT: vmovd %eax, %xmm2 +; X64-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; X64-AVX1-NEXT: vmovd %xmm1, (%rax) +; X64-AVX1-NEXT: vmovaps %ymm0, (%rax) +; X64-AVX1-NEXT: popq %rbx +; X64-AVX1-NEXT: popq %rbp +; X64-AVX1-NEXT: vzeroupper +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: PR34947: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vmovdqa (%rax), %ymm0 +; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vpextrd $1, %xmm1, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: vmovd %xmm1, %esi +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %esi +; X64-AVX2-NEXT: vmovd %edx, %xmm2 +; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpextrd $2, %xmm1, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpextrd $3, %xmm1, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm1 +; X64-AVX2-NEXT: vpextrd $1, %xmm0, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: movl %edx, %ecx +; X64-AVX2-NEXT: vmovd %xmm0, %esi +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %esi +; X64-AVX2-NEXT: vmovd %edx, %xmm2 +; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpextrd $2, %xmm0, %ecx +; X64-AVX2-NEXT: xorl %eax, %eax +; X64-AVX2-NEXT: xorl %edx, %edx +; X64-AVX2-NEXT: divl %ecx +; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpextrd $3, %xmm0, %ecx +; X64-AVX2-NEXT: 
xorl %eax, %eax
+; X64-AVX2-NEXT: xorl %edx, %edx
+; X64-AVX2-NEXT: divl %ecx
+; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0
+; X64-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: xorl %edx, %edx
+; X64-AVX2-NEXT: divl (%rax)
+; X64-AVX2-NEXT: vmovd %edx, %xmm1
+; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
+; X64-AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: movl $8199, %eax # imm = 0x2007
+; X64-AVX2-NEXT: vmovd %eax, %xmm2
+; X64-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovd %xmm1, (%rax)
+; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
 %tmp = load <9 x i32>, <9 x i32>* undef, align 64
 %rem = urem <9 x i32> zeroinitializer, %tmp
 %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem