Diffstat (limited to 'test/Analysis/CostModel')
30 files changed, 4259 insertions, 1788 deletions
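All of the tests touched by this change check the numbers printed by "opt -cost-model -analyze", which asks the target's TargetTransformInfo (TTI) what each IR instruction is expected to cost. As a rough guide for reading the "cost of N" lines in the diffs below, here is a minimal C++ sketch of how such per-instruction costs can be queried. The TTI methods named do exist, but their exact signatures have shifted across LLVM releases, and printCosts itself is purely illustrative, not the in-tree cost-model pass.

// Sketch only: how "cost of N for instruction: ..." values like the ones
// checked below are produced by asking TargetTransformInfo. Treat the exact
// signatures as version-dependent; this is an illustration, not the real pass.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static void printCosts(Function &F, TargetTransformInfo &TTI) {
  for (BasicBlock &BB : F)
    for (Instruction &I : BB) {
      if (auto *BO = dyn_cast<BinaryOperator>(&I))
        // Arithmetic such as the add/mul/fadd/fdiv cases in the X86 tests.
        errs() << "cost of "
               << TTI.getArithmeticInstrCost(BO->getOpcode(), BO->getType())
               << " for instruction: " << I << "\n";
      else if (auto *ST = dyn_cast<StoreInst>(&I))
        // Memory operations such as the <2 x i64> store in AArch64/store.ll.
        errs() << "cost of "
               << TTI.getMemoryOpCost(Instruction::Store,
                                      ST->getValueOperand()->getType(),
                                      ST->getAlignment(), /*AddressSpace=*/0)
               << " for instruction: " << I << "\n";
    }
}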
diff --git a/test/Analysis/CostModel/AArch64/gep.ll b/test/Analysis/CostModel/AArch64/gep.ll new file mode 100644 index 000000000000..f3d83c133027 --- /dev/null +++ b/test/Analysis/CostModel/AArch64/gep.ll @@ -0,0 +1,292 @@ +; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mcpu=kryo < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +define i8 @test1(i8* %p, i32 %i) { +; CHECK-LABEL: test1 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* + %a = getelementptr inbounds i8, i8* %p, i32 0 + %v = load i8, i8* %a + ret i8 %v +} + +define i16 @test2(i16* %p, i32 %i) { +; CHECK-LABEL: test2 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* + %a = getelementptr inbounds i16, i16* %p, i32 0 + %v = load i16, i16* %a + ret i16 %v +} + +define i32 @test3(i32* %p, i32 %i) { +; CHECK-LABEL: test3 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32* + %a = getelementptr inbounds i32, i32* %p, i32 0 + %v = load i32, i32* %a + ret i32 %v +} + +define i64 @test4(i64* %p, i32 %i) { +; CHECK-LABEL: test4 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64* + %a = getelementptr inbounds i64, i64* %p, i32 0 + %v = load i64, i64* %a + ret i64 %v +} + +define i8 @test5(i8* %p, i32 %i) { +; CHECK-LABEL: test5 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* + %a = getelementptr inbounds i8, i8* %p, i32 1024 + %v = load i8, i8* %a + ret i8 %v +} + +define i16 @test6(i16* %p, i32 %i) { +; CHECK-LABEL: test6 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* + %a = getelementptr inbounds i16, i16* %p, i32 1024 + %v = load i16, i16* %a + ret i16 %v +} + +define i32 @test7(i32* %p, i32 %i) { +; CHECK-LABEL: test7 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32* + %a = getelementptr inbounds i32, i32* %p, i32 1024 + %v = load i32, i32* %a + ret i32 %v +} + +define i64 @test8(i64* %p, i32 %i) { +; CHECK-LABEL: test8 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64* + %a = getelementptr inbounds i64, i64* %p, i32 1024 + %v = load i64, i64* %a + ret i64 %v +} + +define i8 @test9(i8* %p, i32 %i) { +; CHECK-LABEL: test9 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8* + %a = getelementptr inbounds i8, i8* %p, i32 4096 + %v = load i8, i8* %a + ret i8 %v +} + +define i16 @test10(i16* %p, i32 %i) { +; CHECK-LABEL: test10 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16* + %a = getelementptr inbounds i16, i16* %p, i32 4096 + %v = load i16, i16* %a + ret i16 %v +} + +define i32 @test11(i32* %p, i32 %i) { +; CHECK-LABEL: test11 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* + %a = getelementptr inbounds i32, i32* %p, i32 4096 + %v = load i32, i32* %a + ret i32 %v +} + +define i64 @test12(i64* %p, i32 %i) { +; CHECK-LABEL: test12 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* + %a = getelementptr inbounds i64, i64* %p, i32 4096 + %v = load i64, i64* %a + ret i64 %v +} + +define i8 @test13(i8* %p, i32 %i) { +; CHECK-LABEL: test13 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* + %a = getelementptr inbounds i8, i8* %p, i32 -64 + %v = load i8, i8* %a + ret i8 %v +} + +define i16 @test14(i16* %p, i32 %i) { +; CHECK-LABEL: test14 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, 
i16* + %a = getelementptr inbounds i16, i16* %p, i32 -64 + %v = load i16, i16* %a + ret i16 %v +} + +define i32 @test15(i32* %p, i32 %i) { +; CHECK-LABEL: test15 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32* + %a = getelementptr inbounds i32, i32* %p, i32 -64 + %v = load i32, i32* %a + ret i32 %v +} + +define i64 @test16(i64* %p, i32 %i) { +; CHECK-LABEL: test16 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* + %a = getelementptr inbounds i64, i64* %p, i32 -64 + %v = load i64, i64* %a + ret i64 %v +} + +define i8 @test17(i8* %p, i32 %i) { +; CHECK-LABEL: test17 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8* + %a = getelementptr inbounds i8, i8* %p, i32 -1024 + %v = load i8, i8* %a + ret i8 %v +} + +define i16 @test18(i16* %p, i32 %i) { +; CHECK-LABEL: test18 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16* + %a = getelementptr inbounds i16, i16* %p, i32 -1024 + %v = load i16, i16* %a + ret i16 %v +} + +define i32 @test19(i32* %p, i32 %i) { +; CHECK-LABEL: test19 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* + %a = getelementptr inbounds i32, i32* %p, i32 -1024 + %v = load i32, i32* %a + ret i32 %v +} + +define i64 @test20(i64* %p, i32 %i) { +; CHECK-LABEL: test20 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* + %a = getelementptr inbounds i64, i64* %p, i32 -1024 + %v = load i64, i64* %a + ret i64 %v +} + +define i8 @test21(i8* %p, i32 %i) { +; CHECK-LABEL: test21 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* + %a = getelementptr inbounds i8, i8* %p, i32 %i + %v = load i8, i8* %a + ret i8 %v +} + +define i16 @test22(i16* %p, i32 %i) { +; CHECK-LABEL: test22 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* + %a = getelementptr inbounds i16, i16* %p, i32 %i + %v = load i16, i16* %a + ret i16 %v +} + +define i32 @test23(i32* %p, i32 %i) { +; CHECK-LABEL: test23 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32* + %a = getelementptr inbounds i32, i32* %p, i32 %i + %v = load i32, i32* %a + ret i32 %v +} + +define i64 @test24(i64* %p, i32 %i) { +; CHECK-LABEL: test24 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64* + %a = getelementptr inbounds i64, i64* %p, i32 %i + %v = load i64, i64* %a + ret i64 %v +} + +define i8 @test25(i8* %p, i32 %i) { +; CHECK-LABEL: test25 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* + %a = getelementptr inbounds i8, i8* %p, i32 -128 + %v = load i8, i8* %a + ret i8 %v +} + +define i16 @test26(i16* %p, i32 %i) { +; CHECK-LABEL: test26 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i16, i16* + %a = getelementptr inbounds i16, i16* %p, i32 -128 + %v = load i16, i16* %a + ret i16 %v +} + +define i32 @test27(i32* %p, i32 %i) { +; CHECK-LABEL: test27 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* + %a = getelementptr inbounds i32, i32* %p, i32 -128 + %v = load i32, i32* %a + ret i32 %v +} + +define i64 @test28(i64* %p, i32 %i) { +; CHECK-LABEL: test28 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* + %a = getelementptr inbounds i64, i64* %p, i32 -128 + %v = load i64, i64* %a + ret i64 %v +} + +define i8 @test29(i8* %p, i32 %i) { +; CHECK-LABEL: test29 +; CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* + %a = getelementptr inbounds i8, i8* %p, i32 -256 + %v = load 
i8, i8* %a + ret i8 %v +} + +define i16 @test30(i16* %p, i32 %i) { +; CHECK-LABEL: test30 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16* + %a = getelementptr inbounds i16, i16* %p, i32 -256 + %v = load i16, i16* %a + ret i16 %v +} + +define i32 @test31(i32* %p, i32 %i) { +; CHECK-LABEL: test31 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* + %a = getelementptr inbounds i32, i32* %p, i32 -256 + %v = load i32, i32* %a + ret i32 %v +} + +define i64 @test32(i64* %p, i32 %i) { +; CHECK-LABEL: test32 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* + %a = getelementptr inbounds i64, i64* %p, i32 -256 + %v = load i64, i64* %a + ret i64 %v +} + +define i8 @test33(i8* %p, i32 %i) { +; CHECK-LABEL: test33 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i8, i8* + %a = getelementptr inbounds i8, i8* %p, i32 -512 + %v = load i8, i8* %a + ret i8 %v +} + +define i16 @test34(i16* %p, i32 %i) { +; CHECK-LABEL: test34 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i16, i16* + %a = getelementptr inbounds i16, i16* %p, i32 -512 + %v = load i16, i16* %a + ret i16 %v +} + +define i32 @test35(i32* %p, i32 %i) { +; CHECK-LABEL: test35 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i32, i32* + %a = getelementptr inbounds i32, i32* %p, i32 -512 + %v = load i32, i32* %a + ret i32 %v +} + +define i64 @test36(i64* %p, i32 %i) { +; CHECK-LABEL: test36 +; CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* + %a = getelementptr inbounds i64, i64* %p, i32 -512 + %v = load i64, i64* %a + ret i64 %v +} diff --git a/test/Analysis/CostModel/AArch64/store.ll b/test/Analysis/CostModel/AArch64/store.ll index 307f8f8ee974..58750721cb97 100644 --- a/test/Analysis/CostModel/AArch64/store.ll +++ b/test/Analysis/CostModel/AArch64/store.ll @@ -1,10 +1,16 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s +; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-apple-ios | FileCheck %s +; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-apple-ios -mattr=slow-misaligned-128store | FileCheck %s --check-prefix=SLOW_MISALIGNED_128_STORE + target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" -; CHECK-LABEL: store -define void @store() { - ; Stores of <2 x i64> should be expensive because we don't split them and - ; and unaligned 16b stores have bad performance. - ; CHECK: cost of 12 {{.*}} store +; CHECK-LABEL: getMemoryOpCost +; SLOW_MISALIGNED_128_STORE-LABEL: getMemoryOpCost +define void @getMemoryOpCost() { + ; If FeatureSlowMisaligned128Store is set, we penalize <2 x i64> stores. On + ; Cyclone, for example, such stores should be expensive because we don't + ; split them and misaligned 16b stores have bad performance. 
+ ; + ; CHECK: cost of 1 {{.*}} store + ; SLOW_MISALIGNED_128_STORE: cost of 12 {{.*}} store store <2 x i64> undef, <2 x i64> * undef ; We scalarize the loads/stores because there is no vector register name for diff --git a/test/Analysis/CostModel/ARM/gep.ll b/test/Analysis/CostModel/ARM/gep.ll index a70d6d42b61b..9d74da4c2d3b 100644 --- a/test/Analysis/CostModel/ARM/gep.ll +++ b/test/Analysis/CostModel/ARM/gep.ll @@ -44,17 +44,17 @@ define void @test_geps(i32 %i) { %b4 = getelementptr inbounds float, float* undef, i32 1024 ;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds double, double* %b5 = getelementptr inbounds double, double* undef, i32 1024 -;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i8>, <4 x i8>* +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i8>, <4 x i8>* %b7 = getelementptr inbounds <4 x i8>, <4 x i8>* undef, i32 1 -;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i16>, <4 x i16>* +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i16>, <4 x i16>* %b8 = getelementptr inbounds <4 x i16>, <4 x i16>* undef, i32 1 -;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i32>, <4 x i32>* +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i32>, <4 x i32>* %b9 = getelementptr inbounds <4 x i32>, <4 x i32>* undef, i32 1 -;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i64>, <4 x i64>* +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i64>, <4 x i64>* %b10 = getelementptr inbounds <4 x i64>, <4 x i64>* undef, i32 1 -;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x float>, <4 x float>* +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x float>, <4 x float>* %b11 = getelementptr inbounds <4 x float>, <4 x float>* undef, i32 1 -;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x double>, <4 x double>* +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x double>, <4 x double>* %b12 = getelementptr inbounds <4 x double>, <4 x double>* undef, i32 1 ;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i8, i8* @@ -63,15 +63,15 @@ define void @test_geps(i32 %i) { %c1 = getelementptr inbounds i16, i16* undef, i32 %i ;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i32, i32* %c2 = getelementptr inbounds i32, i32* undef, i32 %i -;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds i64, i64* +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds i64, i64* %c3 = getelementptr inbounds i64, i64* undef, i32 %i -;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds float, float* +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds float, float* %c4 = getelementptr inbounds float, float* undef, i32 %i -;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds double, double* +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds double, double* %c5 = getelementptr inbounds double, double* undef, i32 %i -;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i8>, <4 x i8>* +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i8>, <4 x i8>* %c7 = getelementptr inbounds <4 x i8>, <4 x i8>* undef, i32 %i -;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds <4 x i16>, <4 x i16>* +;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i16>, <4 x i16>* %c8 = getelementptr inbounds <4 x i16>, <4 x i16>* 
undef, i32 %i ; Thumb-2 cannot fold scales larger than 8 to address computation. ;CHECK: cost of 1 for instruction: {{.*}} getelementptr inbounds <4 x i32>, <4 x i32>* diff --git a/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll b/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll new file mode 100644 index 000000000000..4afeabca00ad --- /dev/null +++ b/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll @@ -0,0 +1,19 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +define i32 @loads(i32 %arg) { + ; CHECK: cost of 1 {{.*}} load + load <4 x i8>, <4 x i8>* undef, align 1 + + ; CHECK: cost of 1 {{.*}} load + load <8 x i8>, <8 x i8>* undef, align 1 + + ; CHECK: cost of 1 {{.*}} load + load <2 x i16>, <2 x i16>* undef, align 2 + + ; CHECK: cost of 1 {{.*}} load + load <4 x i16>, <4 x i16>* undef, align 2 + + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/arith-fp.ll b/test/Analysis/CostModel/X86/arith-fp.ll new file mode 100644 index 000000000000..689442f67a13 --- /dev/null +++ b/test/Analysis/CostModel/X86/arith-fp.ll @@ -0,0 +1,544 @@ +; RUN: opt < %s -enable-no-nans-fp-math -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2 +; RUN: opt < %s -enable-no-nans-fp-math -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42 +; RUN: opt < %s -enable-no-nans-fp-math -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX +; RUN: opt < %s -enable-no-nans-fp-math -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -enable-no-nans-fp-math -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -enable-no-nans-fp-math -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; CHECK-LABEL: 'fadd' +define i32 @fadd(i32 %arg) { + ; SSE2: cost of 2 {{.*}} %F32 = fadd + ; SSE42: cost of 2 {{.*}} %F32 = fadd + ; AVX: cost of 2 {{.*}} %F32 = fadd + ; AVX2: cost of 2 {{.*}} %F32 = fadd + ; AVX512: cost of 2 {{.*}} %F32 = fadd + %F32 = fadd float undef, undef + ; SSE2: cost of 2 {{.*}} %V4F32 = fadd + ; SSE42: cost of 2 {{.*}} %V4F32 = fadd + ; AVX: cost of 2 {{.*}} %V4F32 = fadd + ; AVX2: cost of 2 {{.*}} %V4F32 = fadd + ; AVX512: cost of 2 {{.*}} %V4F32 = fadd + %V4F32 = fadd <4 x float> undef, undef + ; SSE2: cost of 4 {{.*}} %V8F32 = fadd + ; SSE42: cost of 4 {{.*}} %V8F32 = fadd + ; AVX: cost of 2 {{.*}} %V8F32 = fadd + ; AVX2: cost of 2 {{.*}} %V8F32 = fadd + ; AVX512: cost of 2 {{.*}} %V8F32 = fadd + %V8F32 = fadd <8 x float> undef, undef + ; SSE2: cost of 8 {{.*}} %V16F32 = fadd + ; SSE42: cost of 8 {{.*}} %V16F32 = fadd + ; AVX: cost of 4 {{.*}} %V16F32 = fadd + ; AVX2: cost of 4 {{.*}} %V16F32 = fadd + ; 
AVX512: cost of 2 {{.*}} %V16F32 = fadd + %V16F32 = fadd <16 x float> undef, undef + + ; SSE2: cost of 2 {{.*}} %F64 = fadd + ; SSE42: cost of 2 {{.*}} %F64 = fadd + ; AVX: cost of 2 {{.*}} %F64 = fadd + ; AVX2: cost of 2 {{.*}} %F64 = fadd + ; AVX512: cost of 2 {{.*}} %F64 = fadd + %F64 = fadd double undef, undef + ; SSE2: cost of 2 {{.*}} %V2F64 = fadd + ; SSE42: cost of 2 {{.*}} %V2F64 = fadd + ; AVX: cost of 2 {{.*}} %V2F64 = fadd + ; AVX2: cost of 2 {{.*}} %V2F64 = fadd + ; AVX512: cost of 2 {{.*}} %V2F64 = fadd + %V2F64 = fadd <2 x double> undef, undef + ; SSE2: cost of 4 {{.*}} %V4F64 = fadd + ; SSE42: cost of 4 {{.*}} %V4F64 = fadd + ; AVX: cost of 2 {{.*}} %V4F64 = fadd + ; AVX2: cost of 2 {{.*}} %V4F64 = fadd + ; AVX512: cost of 2 {{.*}} %V4F64 = fadd + %V4F64 = fadd <4 x double> undef, undef + ; SSE2: cost of 8 {{.*}} %V8F64 = fadd + ; SSE42: cost of 8 {{.*}} %V8F64 = fadd + ; AVX: cost of 4 {{.*}} %V8F64 = fadd + ; AVX2: cost of 4 {{.*}} %V8F64 = fadd + ; AVX512: cost of 2 {{.*}} %V8F64 = fadd + %V8F64 = fadd <8 x double> undef, undef + + ret i32 undef +} + +; CHECK-LABEL: 'fsub' +define i32 @fsub(i32 %arg) { + ; SSE2: cost of 2 {{.*}} %F32 = fsub + ; SSE42: cost of 2 {{.*}} %F32 = fsub + ; AVX: cost of 2 {{.*}} %F32 = fsub + ; AVX2: cost of 2 {{.*}} %F32 = fsub + ; AVX512: cost of 2 {{.*}} %F32 = fsub + %F32 = fsub float undef, undef + ; SSE2: cost of 2 {{.*}} %V4F32 = fsub + ; SSE42: cost of 2 {{.*}} %V4F32 = fsub + ; AVX: cost of 2 {{.*}} %V4F32 = fsub + ; AVX2: cost of 2 {{.*}} %V4F32 = fsub + ; AVX512: cost of 2 {{.*}} %V4F32 = fsub + %V4F32 = fsub <4 x float> undef, undef + ; SSE2: cost of 4 {{.*}} %V8F32 = fsub + ; SSE42: cost of 4 {{.*}} %V8F32 = fsub + ; AVX: cost of 2 {{.*}} %V8F32 = fsub + ; AVX2: cost of 2 {{.*}} %V8F32 = fsub + ; AVX512: cost of 2 {{.*}} %V8F32 = fsub + %V8F32 = fsub <8 x float> undef, undef + ; SSE2: cost of 8 {{.*}} %V16F32 = fsub + ; SSE42: cost of 8 {{.*}} %V16F32 = fsub + ; AVX: cost of 4 {{.*}} %V16F32 = fsub + ; AVX2: cost of 4 {{.*}} %V16F32 = fsub + ; AVX512: cost of 2 {{.*}} %V16F32 = fsub + %V16F32 = fsub <16 x float> undef, undef + + ; SSE2: cost of 2 {{.*}} %F64 = fsub + ; SSE42: cost of 2 {{.*}} %F64 = fsub + ; AVX: cost of 2 {{.*}} %F64 = fsub + ; AVX2: cost of 2 {{.*}} %F64 = fsub + ; AVX512: cost of 2 {{.*}} %F64 = fsub + %F64 = fsub double undef, undef + ; SSE2: cost of 2 {{.*}} %V2F64 = fsub + ; SSE42: cost of 2 {{.*}} %V2F64 = fsub + ; AVX: cost of 2 {{.*}} %V2F64 = fsub + ; AVX2: cost of 2 {{.*}} %V2F64 = fsub + ; AVX512: cost of 2 {{.*}} %V2F64 = fsub + %V2F64 = fsub <2 x double> undef, undef + ; SSE2: cost of 4 {{.*}} %V4F64 = fsub + ; SSE42: cost of 4 {{.*}} %V4F64 = fsub + ; AVX: cost of 2 {{.*}} %V4F64 = fsub + ; AVX2: cost of 2 {{.*}} %V4F64 = fsub + ; AVX512: cost of 2 {{.*}} %V4F64 = fsub + %V4F64 = fsub <4 x double> undef, undef + ; SSE2: cost of 8 {{.*}} %V8F64 = fsub + ; SSE42: cost of 8 {{.*}} %V8F64 = fsub + ; AVX: cost of 4 {{.*}} %V8F64 = fsub + ; AVX2: cost of 4 {{.*}} %V8F64 = fsub + ; AVX512: cost of 2 {{.*}} %V8F64 = fsub + %V8F64 = fsub <8 x double> undef, undef + + ret i32 undef +} + +; CHECK-LABEL: 'fmul' +define i32 @fmul(i32 %arg) { + ; SSE2: cost of 2 {{.*}} %F32 = fmul + ; SSE42: cost of 2 {{.*}} %F32 = fmul + ; AVX: cost of 2 {{.*}} %F32 = fmul + ; AVX2: cost of 2 {{.*}} %F32 = fmul + ; AVX512: cost of 2 {{.*}} %F32 = fmul + %F32 = fmul float undef, undef + ; SSE2: cost of 2 {{.*}} %V4F32 = fmul + ; SSE42: cost of 2 {{.*}} %V4F32 = fmul + ; AVX: cost of 2 {{.*}} %V4F32 = fmul + ; AVX2: cost of 2 
{{.*}} %V4F32 = fmul + ; AVX512: cost of 2 {{.*}} %V4F32 = fmul + %V4F32 = fmul <4 x float> undef, undef + ; SSE2: cost of 4 {{.*}} %V8F32 = fmul + ; SSE42: cost of 4 {{.*}} %V8F32 = fmul + ; AVX: cost of 2 {{.*}} %V8F32 = fmul + ; AVX2: cost of 2 {{.*}} %V8F32 = fmul + ; AVX512: cost of 2 {{.*}} %V8F32 = fmul + %V8F32 = fmul <8 x float> undef, undef + ; SSE2: cost of 8 {{.*}} %V16F32 = fmul + ; SSE42: cost of 8 {{.*}} %V16F32 = fmul + ; AVX: cost of 4 {{.*}} %V16F32 = fmul + ; AVX2: cost of 4 {{.*}} %V16F32 = fmul + ; AVX512: cost of 2 {{.*}} %V16F32 = fmul + %V16F32 = fmul <16 x float> undef, undef + + ; SSE2: cost of 2 {{.*}} %F64 = fmul + ; SSE42: cost of 2 {{.*}} %F64 = fmul + ; AVX: cost of 2 {{.*}} %F64 = fmul + ; AVX2: cost of 2 {{.*}} %F64 = fmul + ; AVX512: cost of 2 {{.*}} %F64 = fmul + %F64 = fmul double undef, undef + ; SSE2: cost of 2 {{.*}} %V2F64 = fmul + ; SSE42: cost of 2 {{.*}} %V2F64 = fmul + ; AVX: cost of 2 {{.*}} %V2F64 = fmul + ; AVX2: cost of 2 {{.*}} %V2F64 = fmul + ; AVX512: cost of 2 {{.*}} %V2F64 = fmul + %V2F64 = fmul <2 x double> undef, undef + ; SSE2: cost of 4 {{.*}} %V4F64 = fmul + ; SSE42: cost of 4 {{.*}} %V4F64 = fmul + ; AVX: cost of 2 {{.*}} %V4F64 = fmul + ; AVX2: cost of 2 {{.*}} %V4F64 = fmul + ; AVX512: cost of 2 {{.*}} %V4F64 = fmul + %V4F64 = fmul <4 x double> undef, undef + ; SSE2: cost of 8 {{.*}} %V8F64 = fmul + ; SSE42: cost of 8 {{.*}} %V8F64 = fmul + ; AVX: cost of 4 {{.*}} %V8F64 = fmul + ; AVX2: cost of 4 {{.*}} %V8F64 = fmul + ; AVX512: cost of 2 {{.*}} %V8F64 = fmul + %V8F64 = fmul <8 x double> undef, undef + + ret i32 undef +} + +; CHECK-LABEL: 'fdiv' +define i32 @fdiv(i32 %arg) { + ; SSE2: cost of 23 {{.*}} %F32 = fdiv + ; SSE42: cost of 14 {{.*}} %F32 = fdiv + ; AVX: cost of 14 {{.*}} %F32 = fdiv + ; AVX2: cost of 7 {{.*}} %F32 = fdiv + ; AVX512: cost of 7 {{.*}} %F32 = fdiv + %F32 = fdiv float undef, undef + ; SSE2: cost of 39 {{.*}} %V4F32 = fdiv + ; SSE42: cost of 14 {{.*}} %V4F32 = fdiv + ; AVX: cost of 14 {{.*}} %V4F32 = fdiv + ; AVX2: cost of 7 {{.*}} %V4F32 = fdiv + ; AVX512: cost of 7 {{.*}} %V4F32 = fdiv + %V4F32 = fdiv <4 x float> undef, undef + ; SSE2: cost of 78 {{.*}} %V8F32 = fdiv + ; SSE42: cost of 28 {{.*}} %V8F32 = fdiv + ; AVX: cost of 28 {{.*}} %V8F32 = fdiv + ; AVX2: cost of 14 {{.*}} %V8F32 = fdiv + ; AVX512: cost of 14 {{.*}} %V8F32 = fdiv + %V8F32 = fdiv <8 x float> undef, undef + ; SSE2: cost of 156 {{.*}} %V16F32 = fdiv + ; SSE42: cost of 56 {{.*}} %V16F32 = fdiv + ; AVX: cost of 56 {{.*}} %V16F32 = fdiv + ; AVX2: cost of 28 {{.*}} %V16F32 = fdiv + ; AVX512: cost of 2 {{.*}} %V16F32 = fdiv + %V16F32 = fdiv <16 x float> undef, undef + + ; SSE2: cost of 38 {{.*}} %F64 = fdiv + ; SSE42: cost of 22 {{.*}} %F64 = fdiv + ; AVX: cost of 22 {{.*}} %F64 = fdiv + ; AVX2: cost of 14 {{.*}} %F64 = fdiv + ; AVX512: cost of 14 {{.*}} %F64 = fdiv + %F64 = fdiv double undef, undef + ; SSE2: cost of 69 {{.*}} %V2F64 = fdiv + ; SSE42: cost of 22 {{.*}} %V2F64 = fdiv + ; AVX: cost of 22 {{.*}} %V2F64 = fdiv + ; AVX2: cost of 14 {{.*}} %V2F64 = fdiv + ; AVX512: cost of 14 {{.*}} %V2F64 = fdiv + %V2F64 = fdiv <2 x double> undef, undef + ; SSE2: cost of 138 {{.*}} %V4F64 = fdiv + ; SSE42: cost of 44 {{.*}} %V4F64 = fdiv + ; AVX: cost of 44 {{.*}} %V4F64 = fdiv + ; AVX2: cost of 28 {{.*}} %V4F64 = fdiv + ; AVX512: cost of 28 {{.*}} %V4F64 = fdiv + %V4F64 = fdiv <4 x double> undef, undef + ; SSE2: cost of 276 {{.*}} %V8F64 = fdiv + ; SSE42: cost of 88 {{.*}} %V8F64 = fdiv + ; AVX: cost of 88 {{.*}} %V8F64 = fdiv + ; AVX2: cost of 
56 {{.*}} %V8F64 = fdiv + ; AVX512: cost of 2 {{.*}} %V8F64 = fdiv + %V8F64 = fdiv <8 x double> undef, undef + + ret i32 undef +} + +; CHECK-LABEL: 'frem' +define i32 @frem(i32 %arg) { + ; SSE2: cost of 2 {{.*}} %F32 = frem + ; SSE42: cost of 2 {{.*}} %F32 = frem + ; AVX: cost of 2 {{.*}} %F32 = frem + ; AVX2: cost of 2 {{.*}} %F32 = frem + ; AVX512: cost of 2 {{.*}} %F32 = frem + %F32 = frem float undef, undef + ; SSE2: cost of 14 {{.*}} %V4F32 = frem + ; SSE42: cost of 14 {{.*}} %V4F32 = frem + ; AVX: cost of 14 {{.*}} %V4F32 = frem + ; AVX2: cost of 14 {{.*}} %V4F32 = frem + ; AVX512: cost of 14 {{.*}} %V4F32 = frem + %V4F32 = frem <4 x float> undef, undef + ; SSE2: cost of 28 {{.*}} %V8F32 = frem + ; SSE42: cost of 28 {{.*}} %V8F32 = frem + ; AVX: cost of 30 {{.*}} %V8F32 = frem + ; AVX2: cost of 30 {{.*}} %V8F32 = frem + ; AVX512: cost of 30 {{.*}} %V8F32 = frem + %V8F32 = frem <8 x float> undef, undef + ; SSE2: cost of 56 {{.*}} %V16F32 = frem + ; SSE42: cost of 56 {{.*}} %V16F32 = frem + ; AVX: cost of 60 {{.*}} %V16F32 = frem + ; AVX2: cost of 60 {{.*}} %V16F32 = frem + ; AVX512: cost of 62 {{.*}} %V16F32 = frem + %V16F32 = frem <16 x float> undef, undef + + ; SSE2: cost of 2 {{.*}} %F64 = frem + ; SSE42: cost of 2 {{.*}} %F64 = frem + ; AVX: cost of 2 {{.*}} %F64 = frem + ; AVX2: cost of 2 {{.*}} %F64 = frem + ; AVX512: cost of 2 {{.*}} %F64 = frem + %F64 = frem double undef, undef + ; SSE2: cost of 6 {{.*}} %V2F64 = frem + ; SSE42: cost of 6 {{.*}} %V2F64 = frem + ; AVX: cost of 6 {{.*}} %V2F64 = frem + ; AVX2: cost of 6 {{.*}} %V2F64 = frem + ; AVX512: cost of 6 {{.*}} %V2F64 = frem + %V2F64 = frem <2 x double> undef, undef + ; SSE2: cost of 12 {{.*}} %V4F64 = frem + ; SSE42: cost of 12 {{.*}} %V4F64 = frem + ; AVX: cost of 14 {{.*}} %V4F64 = frem + ; AVX2: cost of 14 {{.*}} %V4F64 = frem + ; AVX512: cost of 14 {{.*}} %V4F64 = frem + %V4F64 = frem <4 x double> undef, undef + ; SSE2: cost of 24 {{.*}} %V8F64 = frem + ; SSE42: cost of 24 {{.*}} %V8F64 = frem + ; AVX: cost of 28 {{.*}} %V8F64 = frem + ; AVX2: cost of 28 {{.*}} %V8F64 = frem + ; AVX512: cost of 30 {{.*}} %V8F64 = frem + %V8F64 = frem <8 x double> undef, undef + + ret i32 undef +} + +; CHECK-LABEL: 'fsqrt' +define i32 @fsqrt(i32 %arg) { + ; SSE2: cost of 28 {{.*}} %F32 = call float @llvm.sqrt.f32 + ; SSE42: cost of 18 {{.*}} %F32 = call float @llvm.sqrt.f32 + ; AVX: cost of 14 {{.*}} %F32 = call float @llvm.sqrt.f32 + ; AVX2: cost of 7 {{.*}} %F32 = call float @llvm.sqrt.f32 + ; AVX512: cost of 7 {{.*}} %F32 = call float @llvm.sqrt.f32 + %F32 = call float @llvm.sqrt.f32(float undef) + ; SSE2: cost of 56 {{.*}} %V4F32 = call <4 x float> @llvm.sqrt.v4f32 + ; SSE42: cost of 18 {{.*}} %V4F32 = call <4 x float> @llvm.sqrt.v4f32 + ; AVX: cost of 14 {{.*}} %V4F32 = call <4 x float> @llvm.sqrt.v4f32 + ; AVX2: cost of 7 {{.*}} %V4F32 = call <4 x float> @llvm.sqrt.v4f32 + ; AVX512: cost of 7 {{.*}} %V4F32 = call <4 x float> @llvm.sqrt.v4f32 + %V4F32 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef) + ; SSE2: cost of 112 {{.*}} %V8F32 = call <8 x float> @llvm.sqrt.v8f32 + ; SSE42: cost of 36 {{.*}} %V8F32 = call <8 x float> @llvm.sqrt.v8f32 + ; AVX: cost of 28 {{.*}} %V8F32 = call <8 x float> @llvm.sqrt.v8f32 + ; AVX2: cost of 14 {{.*}} %V8F32 = call <8 x float> @llvm.sqrt.v8f32 + ; AVX512: cost of 14 {{.*}} %V8F32 = call <8 x float> @llvm.sqrt.v8f32 + %V8F32 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> undef) + ; SSE2: cost of 224 {{.*}} %V16F32 = call <16 x float> @llvm.sqrt.v16f32 + ; SSE42: cost of 72 {{.*}} %V16F32 
= call <16 x float> @llvm.sqrt.v16f32 + ; AVX: cost of 56 {{.*}} %V16F32 = call <16 x float> @llvm.sqrt.v16f32 + ; AVX2: cost of 28 {{.*}} %V16F32 = call <16 x float> @llvm.sqrt.v16f32 + ; AVX512: cost of 1 {{.*}} %V16F32 = call <16 x float> @llvm.sqrt.v16f32 + %V16F32 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> undef) + + ; SSE2: cost of 32 {{.*}} %F64 = call double @llvm.sqrt.f64 + ; SSE42: cost of 32 {{.*}} %F64 = call double @llvm.sqrt.f64 + ; AVX: cost of 21 {{.*}} %F64 = call double @llvm.sqrt.f64 + ; AVX2: cost of 14 {{.*}} %F64 = call double @llvm.sqrt.f64 + ; AVX512: cost of 14 {{.*}} %F64 = call double @llvm.sqrt.f64 + %F64 = call double @llvm.sqrt.f64(double undef) + ; SSE2: cost of 32 {{.*}} %V2F64 = call <2 x double> @llvm.sqrt.v2f64 + ; SSE42: cost of 32 {{.*}} %V2F64 = call <2 x double> @llvm.sqrt.v2f64 + ; AVX: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.sqrt.v2f64 + ; AVX2: cost of 14 {{.*}} %V2F64 = call <2 x double> @llvm.sqrt.v2f64 + ; AVX512: cost of 14 {{.*}} %V2F64 = call <2 x double> @llvm.sqrt.v2f64 + %V2F64 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> undef) + ; SSE2: cost of 64 {{.*}} %V4F64 = call <4 x double> @llvm.sqrt.v4f64 + ; SSE42: cost of 64 {{.*}} %V4F64 = call <4 x double> @llvm.sqrt.v4f64 + ; AVX: cost of 43 {{.*}} %V4F64 = call <4 x double> @llvm.sqrt.v4f64 + ; AVX2: cost of 28 {{.*}} %V4F64 = call <4 x double> @llvm.sqrt.v4f64 + ; AVX512: cost of 28 {{.*}} %V4F64 = call <4 x double> @llvm.sqrt.v4f64 + %V4F64 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> undef) + ; SSE2: cost of 128 {{.*}} %V8F64 = call <8 x double> @llvm.sqrt.v8f64 + ; SSE42: cost of 128 {{.*}} %V8F64 = call <8 x double> @llvm.sqrt.v8f64 + ; AVX: cost of 86 {{.*}} %V8F64 = call <8 x double> @llvm.sqrt.v8f64 + ; AVX2: cost of 56 {{.*}} %V8F64 = call <8 x double> @llvm.sqrt.v8f64 + ; AVX512: cost of 1 {{.*}} %V8F64 = call <8 x double> @llvm.sqrt.v8f64 + %V8F64 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> undef) + + ret i32 undef +} + +; CHECK-LABEL: 'fabs' +define i32 @fabs(i32 %arg) { + ; SSE2: cost of 2 {{.*}} %F32 = call float @llvm.fabs.f32 + ; SSE42: cost of 2 {{.*}} %F32 = call float @llvm.fabs.f32 + ; AVX: cost of 2 {{.*}} %F32 = call float @llvm.fabs.f32 + ; AVX2: cost of 2 {{.*}} %F32 = call float @llvm.fabs.f32 + ; AVX512: cost of 2 {{.*}} %F32 = call float @llvm.fabs.f32 + %F32 = call float @llvm.fabs.f32(float undef) + ; SSE2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.fabs.v4f32 + ; SSE42: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.fabs.v4f32 + ; AVX: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.fabs.v4f32 + ; AVX2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.fabs.v4f32 + ; AVX512: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.fabs.v4f32 + %V4F32 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) + ; SSE2: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.fabs.v8f32 + ; SSE42: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.fabs.v8f32 + ; AVX: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.fabs.v8f32 + ; AVX2: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.fabs.v8f32 + ; AVX512: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.fabs.v8f32 + %V8F32 = call <8 x float> @llvm.fabs.v8f32(<8 x float> undef) + ; SSE2: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.fabs.v16f32 + ; SSE42: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.fabs.v16f32 + ; AVX: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fabs.v16f32 + ; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fabs.v16f32 + ; AVX512: cost of 2 
{{.*}} %V16F32 = call <16 x float> @llvm.fabs.v16f32 + %V16F32 = call <16 x float> @llvm.fabs.v16f32(<16 x float> undef) + + ; SSE2: cost of 2 {{.*}} %F64 = call double @llvm.fabs.f64 + ; SSE42: cost of 2 {{.*}} %F64 = call double @llvm.fabs.f64 + ; AVX: cost of 2 {{.*}} %F64 = call double @llvm.fabs.f64 + ; AVX2: cost of 2 {{.*}} %F64 = call double @llvm.fabs.f64 + ; AVX512: cost of 2 {{.*}} %F64 = call double @llvm.fabs.f64 + %F64 = call double @llvm.fabs.f64(double undef) + ; SSE2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.fabs.v2f64 + ; SSE42: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.fabs.v2f64 + ; AVX: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.fabs.v2f64 + ; AVX2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.fabs.v2f64 + ; AVX512: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.fabs.v2f64 + %V2F64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) + ; SSE2: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.fabs.v4f64 + ; SSE42: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.fabs.v4f64 + ; AVX: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.fabs.v4f64 + ; AVX2: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.fabs.v4f64 + ; AVX512: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.fabs.v4f64 + %V4F64 = call <4 x double> @llvm.fabs.v4f64(<4 x double> undef) + ; SSE2: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.fabs.v8f64 + ; SSE42: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.fabs.v8f64 + ; AVX: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fabs.v8f64 + ; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fabs.v8f64 + ; AVX512: cost of 2 {{.*}} %V8F64 = call <8 x double> @llvm.fabs.v8f64 + %V8F64 = call <8 x double> @llvm.fabs.v8f64(<8 x double> undef) + + ret i32 undef +} + +; CHECK-LABEL: 'fcopysign' +define i32 @fcopysign(i32 %arg) { + ; SSE2: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32 + ; SSE42: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32 + ; AVX: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32 + ; AVX2: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32 + ; AVX512: cost of 2 {{.*}} %F32 = call float @llvm.copysign.f32 + %F32 = call float @llvm.copysign.f32(float undef, float undef) + ; SSE2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32 + ; SSE42: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32 + ; AVX: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32 + ; AVX2: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32 + ; AVX512: cost of 2 {{.*}} %V4F32 = call <4 x float> @llvm.copysign.v4f32 + %V4F32 = call <4 x float> @llvm.copysign.v4f32(<4 x float> undef, <4 x float> undef) + ; SSE2: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32 + ; SSE42: cost of 4 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32 + ; AVX: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32 + ; AVX2: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32 + ; AVX512: cost of 2 {{.*}} %V8F32 = call <8 x float> @llvm.copysign.v8f32 + %V8F32 = call <8 x float> @llvm.copysign.v8f32(<8 x float> undef, <8 x float> undef) + ; SSE2: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32 + ; SSE42: cost of 8 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32 + ; AVX: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32 + ; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32 + ; AVX512: cost of 2 {{.*}} %V16F32 = call <16 x float> @llvm.copysign.v16f32 + %V16F32 = call <16 x 
float> @llvm.copysign.v16f32(<16 x float> undef, <16 x float> undef) + + ; SSE2: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64 + ; SSE42: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64 + ; AVX: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64 + ; AVX2: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64 + ; AVX512: cost of 2 {{.*}} %F64 = call double @llvm.copysign.f64 + %F64 = call double @llvm.copysign.f64(double undef, double undef) + ; SSE2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64 + ; SSE42: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64 + ; AVX: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64 + ; AVX2: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64 + ; AVX512: cost of 2 {{.*}} %V2F64 = call <2 x double> @llvm.copysign.v2f64 + %V2F64 = call <2 x double> @llvm.copysign.v2f64(<2 x double> undef, <2 x double> undef) + ; SSE2: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64 + ; SSE42: cost of 4 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64 + ; AVX: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64 + ; AVX2: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64 + ; AVX512: cost of 2 {{.*}} %V4F64 = call <4 x double> @llvm.copysign.v4f64 + %V4F64 = call <4 x double> @llvm.copysign.v4f64(<4 x double> undef, <4 x double> undef) + ; SSE2: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64 + ; SSE42: cost of 8 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64 + ; AVX: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64 + ; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64 + ; AVX512: cost of 2 {{.*}} %V8F64 = call <8 x double> @llvm.copysign.v8f64 + %V8F64 = call <8 x double> @llvm.copysign.v8f64(<8 x double> undef, <8 x double> undef) + + ret i32 undef +} + +; CHECK-LABEL: 'fma' +define i32 @fma(i32 %arg) { + ; SSE2: cost of 10 {{.*}} %F32 = call float @llvm.fma.f32 + ; SSE42: cost of 10 {{.*}} %F32 = call float @llvm.fma.f32 + ; AVX: cost of 1 {{.*}} %F32 = call float @llvm.fma.f32 + ; AVX2: cost of 1 {{.*}} %F32 = call float @llvm.fma.f32 + ; AVX512: cost of 1 {{.*}} %F32 = call float @llvm.fma.f32 + %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) + ; SSE2: cost of 52 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32 + ; SSE42: cost of 52 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32 + ; AVX: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32 + ; AVX2: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32 + ; AVX512: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32 + %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) + ; SSE2: cost of 104 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32 + ; SSE42: cost of 104 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32 + ; AVX: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32 + ; AVX2: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32 + ; AVX512: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32 + %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) + ; SSE2: cost of 208 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32 + ; SSE42: cost of 208 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32 + ; AVX: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32 + ; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32 + ; AVX512: cost of 1 {{.*}} %V16F32 = call <16 
x float> @llvm.fma.v16f32 + %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef) + + ; SSE2: cost of 10 {{.*}} %F64 = call double @llvm.fma.f64 + ; SSE42: cost of 10 {{.*}} %F64 = call double @llvm.fma.f64 + ; AVX: cost of 1 {{.*}} %F64 = call double @llvm.fma.f64 + ; AVX2: cost of 1 {{.*}} %F64 = call double @llvm.fma.f64 + ; AVX512: cost of 1 {{.*}} %F64 = call double @llvm.fma.f64 + %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) + ; SSE2: cost of 24 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64 + ; SSE42: cost of 24 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64 + ; AVX: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64 + ; AVX2: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64 + ; AVX512: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64 + %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) + ; SSE2: cost of 48 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64 + ; SSE42: cost of 48 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64 + ; AVX: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64 + ; AVX2: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64 + ; AVX512: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64 + %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) + ; SSE2: cost of 96 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64 + ; SSE42: cost of 96 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64 + ; AVX: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64 + ; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64 + ; AVX512: cost of 1 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64 + %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef) + + ret i32 undef +} + +declare float @llvm.sqrt.f32(float) +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) +declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) +declare <16 x float> @llvm.sqrt.v16f32(<16 x float>) + +declare double @llvm.sqrt.f64(double) +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) +declare <8 x double> @llvm.sqrt.v8f64(<8 x double>) + +declare float @llvm.fabs.f32(float) +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) +declare <8 x float> @llvm.fabs.v8f32(<8 x float>) +declare <16 x float> @llvm.fabs.v16f32(<16 x float>) + +declare double @llvm.fabs.f64(double) +declare <2 x double> @llvm.fabs.v2f64(<2 x double>) +declare <4 x double> @llvm.fabs.v4f64(<4 x double>) +declare <8 x double> @llvm.fabs.v8f64(<8 x double>) + +declare float @llvm.copysign.f32(float, float) +declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) +declare <8 x float> @llvm.copysign.v8f32(<8 x float>, <8 x float>) +declare <16 x float> @llvm.copysign.v16f32(<16 x float>, <16 x float>) + +declare double @llvm.copysign.f64(double, double) +declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) +declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) +declare <8 x double> @llvm.copysign.v8f64(<8 x double>, <8 x double>) + +declare float @llvm.fma.f32(float, float, float) +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) +declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) +declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) + +declare double @llvm.fma.f64(double, 
double, double) +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) +declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) +declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) diff --git a/test/Analysis/CostModel/X86/arith.ll b/test/Analysis/CostModel/X86/arith.ll index a35db9c68ffb..7319efb413d6 100644 --- a/test/Analysis/CostModel/X86/arith.ll +++ b/test/Analysis/CostModel/X86/arith.ll @@ -2,6 +2,9 @@ ; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42 ; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX ; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512DQ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -12,27 +15,246 @@ define i32 @add(i32 %arg) { ; SSE42: cost of 1 {{.*}} %A = add ; AVX: cost of 1 {{.*}} %A = add ; AVX2: cost of 1 {{.*}} %A = add - %A = add <4 x i32> undef, undef + ; AVX512: cost of 1 {{.*}} %A = add + %A = add <2 x i64> undef, undef ; SSSE3: cost of 2 {{.*}} %B = add ; SSE42: cost of 2 {{.*}} %B = add - ; AVX: cost of 4 {{.*}} %B = add + ; AVX: cost of 4 {{.*}} %B = add ; AVX2: cost of 1 {{.*}} %B = add - %B = add <8 x i32> undef, undef - ; SSSE3: cost of 1 {{.*}} %C = add - ; SSE42: cost of 1 {{.*}} %C = add - ; AVX: cost of 1 {{.*}} %C = add - ; AVX2: cost of 1 {{.*}} %C = add - %C = add <2 x i64> undef, undef - ; SSSE3: cost of 2 {{.*}} %D = add - ; SSE42: cost of 2 {{.*}} %D = add - ; AVX: cost of 4 {{.*}} %D = add + ; AVX512: cost of 1 {{.*}} %B = add + %B = add <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %C = add + ; SSE42: cost of 4 {{.*}} %C = add + ; AVX: cost of 8 {{.*}} %C = add + ; AVX2: cost of 2 {{.*}} %C = add + ; AVX512: cost of 1 {{.*}} %C = add + %C = add <8 x i64> undef, undef + + ; SSSE3: cost of 1 {{.*}} %D = add + ; SSE42: cost of 1 {{.*}} %D = add + ; AVX: cost of 1 {{.*}} %D = add ; AVX2: cost of 1 {{.*}} %D = add - %D = add <4 x i64> undef, undef - ; SSSE3: cost of 4 {{.*}} %E = add - ; SSE42: cost of 4 {{.*}} %E = add - ; AVX: cost of 8 {{.*}} %E = add - ; AVX2: cost of 2 {{.*}} %E = add - %E = add <8 x i64> undef, undef + ; AVX512: cost of 1 {{.*}} %D = add + %D = add <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %E = add + ; SSE42: cost of 2 {{.*}} %E = add + ; AVX: cost of 4 {{.*}} %E = add + ; AVX2: cost of 1 {{.*}} %E = add + ; AVX512: cost of 1 {{.*}} %E = add + %E = add <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %F = add + ; SSE42: cost of 4 {{.*}} %F = add + ; AVX: cost of 8 {{.*}} %F = add + ; AVX2: cost of 2 {{.*}} %F = add + ; AVX512: cost of 1 {{.*}} %F = add + %F = add <16 x i32> 
undef, undef + + ; SSSE3: cost of 1 {{.*}} %G = add + ; SSE42: cost of 1 {{.*}} %G = add + ; AVX: cost of 1 {{.*}} %G = add + ; AVX2: cost of 1 {{.*}} %G = add + ; AVX512: cost of 1 {{.*}} %G = add + %G = add <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %H = add + ; SSE42: cost of 2 {{.*}} %H = add + ; AVX: cost of 4 {{.*}} %H = add + ; AVX2: cost of 1 {{.*}} %H = add + ; AVX512: cost of 1 {{.*}} %H = add + %H = add <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %I = add + ; SSE42: cost of 4 {{.*}} %I = add + ; AVX: cost of 8 {{.*}} %I = add + ; AVX2: cost of 2 {{.*}} %I = add + ; AVX512F: cost of 2 {{.*}} %I = add + ; AVX512BW: cost of 1 {{.*}} %I = add + %I = add <32 x i16> undef, undef + + ; SSSE3: cost of 1 {{.*}} %J = add + ; SSE42: cost of 1 {{.*}} %J = add + ; AVX: cost of 1 {{.*}} %J = add + ; AVX2: cost of 1 {{.*}} %J = add + ; AVX512: cost of 1 {{.*}} %J = add + %J = add <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %K = add + ; SSE42: cost of 2 {{.*}} %K = add + ; AVX: cost of 4 {{.*}} %K = add + ; AVX2: cost of 1 {{.*}} %K = add + ; AVX512: cost of 1 {{.*}} %K = add + %K = add <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %L = add + ; SSE42: cost of 4 {{.*}} %L = add + ; AVX: cost of 8 {{.*}} %L = add + ; AVX2: cost of 2 {{.*}} %L = add + ; AVX512F: cost of 2 {{.*}} %L = add + ; AVX512BW: cost of 1 {{.*}} %L = add + %L = add <64 x i8> undef, undef + + ret i32 undef +} + +; CHECK-LABEL: 'sub' +define i32 @sub(i32 %arg) { + ; SSSE3: cost of 1 {{.*}} %A = sub + ; SSE42: cost of 1 {{.*}} %A = sub + ; AVX: cost of 1 {{.*}} %A = sub + ; AVX2: cost of 1 {{.*}} %A = sub + ; AVX512: cost of 1 {{.*}} %A = sub + %A = sub <2 x i64> undef, undef + ; SSSE3: cost of 2 {{.*}} %B = sub + ; SSE42: cost of 2 {{.*}} %B = sub + ; AVX: cost of 4 {{.*}} %B = sub + ; AVX2: cost of 1 {{.*}} %B = sub + ; AVX512: cost of 1 {{.*}} %B = sub + %B = sub <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %C = sub + ; SSE42: cost of 4 {{.*}} %C = sub + ; AVX: cost of 8 {{.*}} %C = sub + ; AVX2: cost of 2 {{.*}} %C = sub + ; AVX512: cost of 1 {{.*}} %C = sub + %C = sub <8 x i64> undef, undef + + ; SSSE3: cost of 1 {{.*}} %D = sub + ; SSE42: cost of 1 {{.*}} %D = sub + ; AVX: cost of 1 {{.*}} %D = sub + ; AVX2: cost of 1 {{.*}} %D = sub + ; AVX512: cost of 1 {{.*}} %D = sub + %D = sub <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %E = sub + ; SSE42: cost of 2 {{.*}} %E = sub + ; AVX: cost of 4 {{.*}} %E = sub + ; AVX2: cost of 1 {{.*}} %E = sub + ; AVX512: cost of 1 {{.*}} %E = sub + %E = sub <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %F = sub + ; SSE42: cost of 4 {{.*}} %F = sub + ; AVX: cost of 8 {{.*}} %F = sub + ; AVX2: cost of 2 {{.*}} %F = sub + ; AVX512: cost of 1 {{.*}} %F = sub + %F = sub <16 x i32> undef, undef + + ; SSSE3: cost of 1 {{.*}} %G = sub + ; SSE42: cost of 1 {{.*}} %G = sub + ; AVX: cost of 1 {{.*}} %G = sub + ; AVX2: cost of 1 {{.*}} %G = sub + ; AVX512: cost of 1 {{.*}} %G = sub + %G = sub <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %H = sub + ; SSE42: cost of 2 {{.*}} %H = sub + ; AVX: cost of 4 {{.*}} %H = sub + ; AVX2: cost of 1 {{.*}} %H = sub + ; AVX512: cost of 1 {{.*}} %H = sub + %H = sub <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %I = sub + ; SSE42: cost of 4 {{.*}} %I = sub + ; AVX: cost of 8 {{.*}} %I = sub + ; AVX2: cost of 2 {{.*}} %I = sub + ; AVX512F: cost of 2 {{.*}} %I = sub + ; AVX512BW: cost of 1 {{.*}} %I = sub + %I = sub <32 x i16> undef, undef + + ; SSSE3: cost of 1 {{.*}} %J = sub + ; SSE42: cost of 1 {{.*}} %J = sub 
+ ; AVX: cost of 1 {{.*}} %J = sub + ; AVX2: cost of 1 {{.*}} %J = sub + ; AVX512: cost of 1 {{.*}} %J = sub + %J = sub <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %K = sub + ; SSE42: cost of 2 {{.*}} %K = sub + ; AVX: cost of 4 {{.*}} %K = sub + ; AVX2: cost of 1 {{.*}} %K = sub + ; AVX512: cost of 1 {{.*}} %K = sub + %K = sub <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %L = sub + ; SSE42: cost of 4 {{.*}} %L = sub + ; AVX: cost of 8 {{.*}} %L = sub + ; AVX2: cost of 2 {{.*}} %L = sub + ; AVX512F: cost of 2 {{.*}} %L = sub + ; AVX512BW: cost of 1 {{.*}} %L = sub + %L = sub <64 x i8> undef, undef + + ret i32 undef +} + +; CHECK-LABEL: 'or' +define i32 @or(i32 %arg) { + ; SSSE3: cost of 1 {{.*}} %A = or + ; SSE42: cost of 1 {{.*}} %A = or + ; AVX: cost of 1 {{.*}} %A = or + ; AVX2: cost of 1 {{.*}} %A = or + ; AVX512: cost of 1 {{.*}} %A = or + %A = or <2 x i64> undef, undef + ; SSSE3: cost of 2 {{.*}} %B = or + ; SSE42: cost of 2 {{.*}} %B = or + ; AVX: cost of 1 {{.*}} %B = or + ; AVX2: cost of 1 {{.*}} %B = or + ; AVX512: cost of 1 {{.*}} %B = or + %B = or <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %C = or + ; SSE42: cost of 4 {{.*}} %C = or + ; AVX: cost of 2 {{.*}} %C = or + ; AVX2: cost of 2 {{.*}} %C = or + ; AVX512: cost of 1 {{.*}} %C = or + %C = or <8 x i64> undef, undef + + ; SSSE3: cost of 1 {{.*}} %D = or + ; SSE42: cost of 1 {{.*}} %D = or + ; AVX: cost of 1 {{.*}} %D = or + ; AVX2: cost of 1 {{.*}} %D = or + ; AVX512: cost of 1 {{.*}} %D = or + %D = or <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %E = or + ; SSE42: cost of 2 {{.*}} %E = or + ; AVX: cost of 1 {{.*}} %E = or + ; AVX2: cost of 1 {{.*}} %E = or + ; AVX512: cost of 1 {{.*}} %E = or + %E = or <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %F = or + ; SSE42: cost of 4 {{.*}} %F = or + ; AVX: cost of 2 {{.*}} %F = or + ; AVX2: cost of 2 {{.*}} %F = or + ; AVX512: cost of 1 {{.*}} %F = or + %F = or <16 x i32> undef, undef + + ; SSSE3: cost of 1 {{.*}} %G = or + ; SSE42: cost of 1 {{.*}} %G = or + ; AVX: cost of 1 {{.*}} %G = or + ; AVX2: cost of 1 {{.*}} %G = or + ; AVX512: cost of 1 {{.*}} %G = or + %G = or <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %H = or + ; SSE42: cost of 2 {{.*}} %H = or + ; AVX: cost of 1 {{.*}} %H = or + ; AVX2: cost of 1 {{.*}} %H = or + ; AVX512: cost of 1 {{.*}} %H = or + %H = or <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %I = or + ; SSE42: cost of 4 {{.*}} %I = or + ; AVX: cost of 2 {{.*}} %I = or + ; AVX2: cost of 2 {{.*}} %I = or + ; AVX512F: cost of 2 {{.*}} %I = or + ; AVX512BW: cost of 1 {{.*}} %I = or + %I = or <32 x i16> undef, undef + + ; SSSE3: cost of 1 {{.*}} %J = or + ; SSE42: cost of 1 {{.*}} %J = or + ; AVX: cost of 1 {{.*}} %J = or + ; AVX2: cost of 1 {{.*}} %J = or + ; AVX512: cost of 1 {{.*}} %J = or + %J = or <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %K = or + ; SSE42: cost of 2 {{.*}} %K = or + ; AVX: cost of 1 {{.*}} %K = or + ; AVX2: cost of 1 {{.*}} %K = or + ; AVX512: cost of 1 {{.*}} %K = or + %K = or <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %L = or + ; SSE42: cost of 4 {{.*}} %L = or + ; AVX: cost of 2 {{.*}} %L = or + ; AVX2: cost of 2 {{.*}} %L = or + ; AVX512F: cost of 2 {{.*}} %L = or + ; AVX512BW: cost of 1 {{.*}} %L = or + %L = or <64 x i8> undef, undef + ret i32 undef } @@ -42,132 +264,270 @@ define i32 @xor(i32 %arg) { ; SSE42: cost of 1 {{.*}} %A = xor ; AVX: cost of 1 {{.*}} %A = xor ; AVX2: cost of 1 {{.*}} %A = xor - %A = xor <4 x i32> undef, undef + ; AVX512: cost of 1 {{.*}} %A = xor 
+ %A = xor <2 x i64> undef, undef ; SSSE3: cost of 2 {{.*}} %B = xor ; SSE42: cost of 2 {{.*}} %B = xor ; AVX: cost of 1 {{.*}} %B = xor ; AVX2: cost of 1 {{.*}} %B = xor - %B = xor <8 x i32> undef, undef - ; SSSE3: cost of 1 {{.*}} %C = xor - ; SSE42: cost of 1 {{.*}} %C = xor - ; AVX: cost of 1 {{.*}} %C = xor - ; AVX2: cost of 1 {{.*}} %C = xor - %C = xor <2 x i64> undef, undef - ; SSSE3: cost of 2 {{.*}} %D = xor - ; SSE42: cost of 2 {{.*}} %D = xor + ; AVX512: cost of 1 {{.*}} %B = xor + %B = xor <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %C = xor + ; SSE42: cost of 4 {{.*}} %C = xor + ; AVX: cost of 2 {{.*}} %C = xor + ; AVX2: cost of 2 {{.*}} %C = xor + ; AVX512: cost of 1 {{.*}} %C = xor + %C = xor <8 x i64> undef, undef + + ; SSSE3: cost of 1 {{.*}} %D = xor + ; SSE42: cost of 1 {{.*}} %D = xor ; AVX: cost of 1 {{.*}} %D = xor ; AVX2: cost of 1 {{.*}} %D = xor - %D = xor <4 x i64> undef, undef + ; AVX512: cost of 1 {{.*}} %D = xor + %D = xor <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %E = xor + ; SSE42: cost of 2 {{.*}} %E = xor + ; AVX: cost of 1 {{.*}} %E = xor + ; AVX2: cost of 1 {{.*}} %E = xor + ; AVX512: cost of 1 {{.*}} %E = xor + %E = xor <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %F = xor + ; SSE42: cost of 4 {{.*}} %F = xor + ; AVX: cost of 2 {{.*}} %F = xor + ; AVX2: cost of 2 {{.*}} %F = xor + ; AVX512: cost of 1 {{.*}} %F = xor + %F = xor <16 x i32> undef, undef + + ; SSSE3: cost of 1 {{.*}} %G = xor + ; SSE42: cost of 1 {{.*}} %G = xor + ; AVX: cost of 1 {{.*}} %G = xor + ; AVX2: cost of 1 {{.*}} %G = xor + ; AVX512: cost of 1 {{.*}} %G = xor + %G = xor <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %H = xor + ; SSE42: cost of 2 {{.*}} %H = xor + ; AVX: cost of 1 {{.*}} %H = xor + ; AVX2: cost of 1 {{.*}} %H = xor + ; AVX512: cost of 1 {{.*}} %H = xor + %H = xor <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %I = xor + ; SSE42: cost of 4 {{.*}} %I = xor + ; AVX: cost of 2 {{.*}} %I = xor + ; AVX2: cost of 2 {{.*}} %I = xor + ; AVX512F: cost of 2 {{.*}} %I = xor + ; AVX512BW: cost of 1 {{.*}} %I = xor + %I = xor <32 x i16> undef, undef + + ; SSSE3: cost of 1 {{.*}} %J = xor + ; SSE42: cost of 1 {{.*}} %J = xor + ; AVX: cost of 1 {{.*}} %J = xor + ; AVX2: cost of 1 {{.*}} %J = xor + ; AVX512: cost of 1 {{.*}} %J = xor + %J = xor <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %K = xor + ; SSE42: cost of 2 {{.*}} %K = xor + ; AVX: cost of 1 {{.*}} %K = xor + ; AVX2: cost of 1 {{.*}} %K = xor + ; AVX512: cost of 1 {{.*}} %K = xor + %K = xor <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %L = xor + ; SSE42: cost of 4 {{.*}} %L = xor + ; AVX: cost of 2 {{.*}} %L = xor + ; AVX2: cost of 2 {{.*}} %L = xor + ; AVX512F: cost of 2 {{.*}} %L = xor + ; AVX512BW: cost of 1 {{.*}} %L = xor + %L = xor <64 x i8> undef, undef + ret i32 undef } -; CHECK-LABEL: 'mul' -define void @mul() { - ; A <2 x i32> gets expanded to a <2 x i64> vector. - ; A <2 x i64> vector multiply is implemented using - ; 3 PMULUDQ and 2 PADDS and 4 shifts. 
- ; SSSE3: cost of 9 {{.*}} %A0 = mul - ; SSE42: cost of 9 {{.*}} %A0 = mul - ; AVX: cost of 9 {{.*}} %A0 = mul - ; AVX2: cost of 9 {{.*}} %A0 = mul - %A0 = mul <2 x i32> undef, undef - ; SSSE3: cost of 6 {{.*}} %A1 = mul - ; SSE42: cost of 1 {{.*}} %A1 = mul - ; AVX: cost of 1 {{.*}} %A1 = mul - ; AVX2: cost of 1 {{.*}} %A1 = mul - %A1 = mul <4 x i32> undef, undef - ; SSSE3: cost of 9 {{.*}} %A2 = mul - ; SSE42: cost of 9 {{.*}} %A2 = mul - ; AVX: cost of 9 {{.*}} %A2 = mul - ; AVX2: cost of 9 {{.*}} %A2 = mul - %A2 = mul <2 x i64> undef, undef - ; SSSE3: cost of 18 {{.*}} %A3 = mul - ; SSE42: cost of 18 {{.*}} %A3 = mul - ; AVX: cost of 18 {{.*}} %A3 = mul - ; AVX2: cost of 9 {{.*}} %A3 = mul - %A3 = mul <4 x i64> undef, undef - ret void +; CHECK-LABEL: 'and' +define i32 @and(i32 %arg) { + ; SSSE3: cost of 1 {{.*}} %A = and + ; SSE42: cost of 1 {{.*}} %A = and + ; AVX: cost of 1 {{.*}} %A = and + ; AVX2: cost of 1 {{.*}} %A = and + ; AVX512: cost of 1 {{.*}} %A = and + %A = and <2 x i64> undef, undef + ; SSSE3: cost of 2 {{.*}} %B = and + ; SSE42: cost of 2 {{.*}} %B = and + ; AVX: cost of 1 {{.*}} %B = and + ; AVX2: cost of 1 {{.*}} %B = and + ; AVX512: cost of 1 {{.*}} %B = and + %B = and <4 x i64> undef, undef + ; SSSE3: cost of 4 {{.*}} %C = and + ; SSE42: cost of 4 {{.*}} %C = and + ; AVX: cost of 2 {{.*}} %C = and + ; AVX2: cost of 2 {{.*}} %C = and + ; AVX512: cost of 1 {{.*}} %C = and + %C = and <8 x i64> undef, undef + + ; SSSE3: cost of 1 {{.*}} %D = and + ; SSE42: cost of 1 {{.*}} %D = and + ; AVX: cost of 1 {{.*}} %D = and + ; AVX2: cost of 1 {{.*}} %D = and + ; AVX512: cost of 1 {{.*}} %D = and + %D = and <4 x i32> undef, undef + ; SSSE3: cost of 2 {{.*}} %E = and + ; SSE42: cost of 2 {{.*}} %E = and + ; AVX: cost of 1 {{.*}} %E = and + ; AVX2: cost of 1 {{.*}} %E = and + ; AVX512: cost of 1 {{.*}} %E = and + %E = and <8 x i32> undef, undef + ; SSSE3: cost of 4 {{.*}} %F = and + ; SSE42: cost of 4 {{.*}} %F = and + ; AVX: cost of 2 {{.*}} %F = and + ; AVX2: cost of 2 {{.*}} %F = and + ; AVX512: cost of 1 {{.*}} %F = and + %F = and <16 x i32> undef, undef + + ; SSSE3: cost of 1 {{.*}} %G = and + ; SSE42: cost of 1 {{.*}} %G = and + ; AVX: cost of 1 {{.*}} %G = and + ; AVX2: cost of 1 {{.*}} %G = and + ; AVX512: cost of 1 {{.*}} %G = and + %G = and <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %H = and + ; SSE42: cost of 2 {{.*}} %H = and + ; AVX: cost of 1 {{.*}} %H = and + ; AVX2: cost of 1 {{.*}} %H = and + ; AVX512: cost of 1 {{.*}} %H = and + %H = and <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %I = and + ; SSE42: cost of 4 {{.*}} %I = and + ; AVX: cost of 2 {{.*}} %I = and + ; AVX2: cost of 2 {{.*}} %I = and + ; AVX512F: cost of 2 {{.*}} %I = and + ; AVX512BW: cost of 1 {{.*}} %I = and + %I = and <32 x i16> undef, undef + + ; SSSE3: cost of 1 {{.*}} %J = and + ; SSE42: cost of 1 {{.*}} %J = and + ; AVX: cost of 1 {{.*}} %J = and + ; AVX2: cost of 1 {{.*}} %J = and + ; AVX512: cost of 1 {{.*}} %J = and + %J = and <16 x i8> undef, undef + ; SSSE3: cost of 2 {{.*}} %K = and + ; SSE42: cost of 2 {{.*}} %K = and + ; AVX: cost of 1 {{.*}} %K = and + ; AVX2: cost of 1 {{.*}} %K = and + ; AVX512: cost of 1 {{.*}} %K = and + %K = and <32 x i8> undef, undef + ; SSSE3: cost of 4 {{.*}} %L = and + ; SSE42: cost of 4 {{.*}} %L = and + ; AVX: cost of 2 {{.*}} %L = and + ; AVX2: cost of 2 {{.*}} %L = and + ; AVX512F: cost of 2 {{.*}} %L = and + ; AVX512BW: cost of 1 {{.*}} %L = and + %L = and <64 x i8> undef, undef + + ret i32 undef } -; CHECK-LABEL: 'fmul' -define i32 
@fmul(i32 %arg) { - ; SSSE3: cost of 2 {{.*}} %A = fmul - ; SSE42: cost of 2 {{.*}} %A = fmul - ; AVX: cost of 2 {{.*}} %A = fmul - ; AVX2: cost of 2 {{.*}} %A = fmul - %A = fmul <4 x float> undef, undef - ; SSSE3: cost of 4 {{.*}} %B = fmul - ; SSE42: cost of 4 {{.*}} %B = fmul - ; AVX: cost of 2 {{.*}} %B = fmul - ; AVX2: cost of 2 {{.*}} %B = fmul - %B = fmul <8 x float> undef, undef +; CHECK-LABEL: 'mul' +define i32 @mul(i32 %arg) { + ; SSSE3: cost of 8 {{.*}} %A = mul + ; SSE42: cost of 8 {{.*}} %A = mul + ; AVX: cost of 8 {{.*}} %A = mul + ; AVX2: cost of 8 {{.*}} %A = mul + ; AVX512F: cost of 8 {{.*}} %A = mul + ; AVX512BW: cost of 8 {{.*}} %A = mul + ; AVX512DQ: cost of 1 {{.*}} %A = mul + %A = mul <2 x i64> undef, undef + ; SSSE3: cost of 16 {{.*}} %B = mul + ; SSE42: cost of 16 {{.*}} %B = mul + ; AVX: cost of 16 {{.*}} %B = mul + ; AVX2: cost of 8 {{.*}} %B = mul + ; AVX512F: cost of 8 {{.*}} %B = mul + ; AVX512BW: cost of 8 {{.*}} %B = mul + ; AVX512DQ: cost of 1 {{.*}} %B = mul + %B = mul <4 x i64> undef, undef + ; SSSE3: cost of 32 {{.*}} %C = mul + ; SSE42: cost of 32 {{.*}} %C = mul + ; AVX: cost of 32 {{.*}} %C = mul + ; AVX2: cost of 16 {{.*}} %C = mul + ; AVX512F: cost of 8 {{.*}} %C = mul + ; AVX512BW: cost of 8 {{.*}} %C = mul + ; AVX512DQ: cost of 1 {{.*}} %C = mul + %C = mul <8 x i64> undef, undef + + ; SSSE3: cost of 6 {{.*}} %D = mul + ; SSE42: cost of 1 {{.*}} %D = mul + ; AVX: cost of 1 {{.*}} %D = mul + ; AVX2: cost of 1 {{.*}} %D = mul + ; AVX512: cost of 1 {{.*}} %D = mul + %D = mul <4 x i32> undef, undef + ; SSSE3: cost of 12 {{.*}} %E = mul + ; SSE42: cost of 2 {{.*}} %E = mul + ; AVX: cost of 4 {{.*}} %E = mul + ; AVX2: cost of 1 {{.*}} %E = mul + ; AVX512: cost of 1 {{.*}} %E = mul + %E = mul <8 x i32> undef, undef + ; SSSE3: cost of 24 {{.*}} %F = mul + ; SSE42: cost of 4 {{.*}} %F = mul + ; AVX: cost of 8 {{.*}} %F = mul + ; AVX2: cost of 2 {{.*}} %F = mul + ; AVX512: cost of 1 {{.*}} %F = mul + %F = mul <16 x i32> undef, undef + + ; SSSE3: cost of 1 {{.*}} %G = mul + ; SSE42: cost of 1 {{.*}} %G = mul + ; AVX: cost of 1 {{.*}} %G = mul + ; AVX2: cost of 1 {{.*}} %G = mul + ; AVX512: cost of 1 {{.*}} %G = mul + %G = mul <8 x i16> undef, undef + ; SSSE3: cost of 2 {{.*}} %H = mul + ; SSE42: cost of 2 {{.*}} %H = mul + ; AVX: cost of 4 {{.*}} %H = mul + ; AVX2: cost of 1 {{.*}} %H = mul + ; AVX512: cost of 1 {{.*}} %H = mul + %H = mul <16 x i16> undef, undef + ; SSSE3: cost of 4 {{.*}} %I = mul + ; SSE42: cost of 4 {{.*}} %I = mul + ; AVX: cost of 8 {{.*}} %I = mul + ; AVX2: cost of 2 {{.*}} %I = mul + ; AVX512F: cost of 2 {{.*}} %I = mul + ; AVX512BW: cost of 1 {{.*}} %I = mul + %I = mul <32 x i16> undef, undef + + ; SSSE3: cost of 12 {{.*}} %J = mul + ; SSE42: cost of 12 {{.*}} %J = mul + ; AVX: cost of 12 {{.*}} %J = mul + ; AVX2: cost of 7 {{.*}} %J = mul + ; AVX512F: cost of 5 {{.*}} %J = mul + ; AVX512BW: cost of 4 {{.*}} %J = mul + %J = mul <16 x i8> undef, undef + ; SSSE3: cost of 24 {{.*}} %K = mul + ; SSE42: cost of 24 {{.*}} %K = mul + ; AVX: cost of 26 {{.*}} %K = mul + ; AVX2: cost of 17 {{.*}} %K = mul + ; AVX512F: cost of 13 {{.*}} %K = mul + ; AVX512BW: cost of 4 {{.*}} %K = mul + %K = mul <32 x i8> undef, undef + ; SSSE3: cost of 48 {{.*}} %L = mul + ; SSE42: cost of 48 {{.*}} %L = mul + ; AVX: cost of 52 {{.*}} %L = mul + ; AVX2: cost of 34 {{.*}} %L = mul + ; AVX512F: cost of 26 {{.*}} %L = mul + ; AVX512BW: cost of 11 {{.*}} %L = mul + %L = mul <64 x i8> undef, undef + ret i32 undef } -; CHECK-LABEL: 'shift' -define void @shift() { - ; 
SSSE3: cost of 10 {{.*}} %A0 = shl - ; SSE42: cost of 10 {{.*}} %A0 = shl - ; AVX: cost of 10 {{.*}} %A0 = shl - ; AVX2: cost of 1 {{.*}} %A0 = shl - %A0 = shl <4 x i32> undef, undef - ; SSSE3: cost of 4 {{.*}} %A1 = shl - ; SSE42: cost of 4 {{.*}} %A1 = shl - ; AVX: cost of 4 {{.*}} %A1 = shl - ; AVX2: cost of 1 {{.*}} %A1 = shl - %A1 = shl <2 x i64> undef, undef - ; SSSE3: cost of 20 {{.*}} %A2 = shl - ; SSE42: cost of 20 {{.*}} %A2 = shl - ; AVX: cost of 20 {{.*}} %A2 = shl - ; AVX2: cost of 1 {{.*}} %A2 = shl - %A2 = shl <8 x i32> undef, undef - ; SSSE3: cost of 8 {{.*}} %A3 = shl - ; SSE42: cost of 8 {{.*}} %A3 = shl - ; AVX: cost of 8 {{.*}} %A3 = shl - ; AVX2: cost of 1 {{.*}} %A3 = shl - %A3 = shl <4 x i64> undef, undef - - ; SSSE3: cost of 16 {{.*}} %B0 = lshr - ; SSE42: cost of 16 {{.*}} %B0 = lshr - ; AVX: cost of 16 {{.*}} %B0 = lshr - ; AVX2: cost of 1 {{.*}} %B0 = lshr - %B0 = lshr <4 x i32> undef, undef - ; SSSE3: cost of 4 {{.*}} %B1 = lshr - ; SSE42: cost of 4 {{.*}} %B1 = lshr - ; AVX: cost of 4 {{.*}} %B1 = lshr - ; AVX2: cost of 1 {{.*}} %B1 = lshr - %B1 = lshr <2 x i64> undef, undef - ; SSSE3: cost of 32 {{.*}} %B2 = lshr - ; SSE42: cost of 32 {{.*}} %B2 = lshr - ; AVX: cost of 32 {{.*}} %B2 = lshr - ; AVX2: cost of 1 {{.*}} %B2 = lshr - %B2 = lshr <8 x i32> undef, undef - ; SSSE3: cost of 8 {{.*}} %B3 = lshr - ; SSE42: cost of 8 {{.*}} %B3 = lshr - ; AVX: cost of 8 {{.*}} %B3 = lshr - ; AVX2: cost of 1 {{.*}} %B3 = lshr - %B3 = lshr <4 x i64> undef, undef - - ; SSSE3: cost of 16 {{.*}} %C0 = ashr - ; SSE42: cost of 16 {{.*}} %C0 = ashr - ; AVX: cost of 16 {{.*}} %C0 = ashr - ; AVX2: cost of 1 {{.*}} %C0 = ashr - %C0 = ashr <4 x i32> undef, undef - ; SSSE3: cost of 12 {{.*}} %C1 = ashr - ; SSE42: cost of 12 {{.*}} %C1 = ashr - ; AVX: cost of 12 {{.*}} %C1 = ashr - ; AVX2: cost of 4 {{.*}} %C1 = ashr - %C1 = ashr <2 x i64> undef, undef - ; SSSE3: cost of 32 {{.*}} %C2 = ashr - ; SSE42: cost of 32 {{.*}} %C2 = ashr - ; AVX: cost of 32 {{.*}} %C2 = ashr - ; AVX2: cost of 1 {{.*}} %C2 = ashr - %C2 = ashr <8 x i32> undef, undef - ; SSSE3: cost of 24 {{.*}} %C3 = ashr - ; SSE42: cost of 24 {{.*}} %C3 = ashr - ; AVX: cost of 24 {{.*}} %C3 = ashr - ; AVX2: cost of 4 {{.*}} %C3 = ashr - %C3 = ashr <4 x i64> undef, undef +; CHECK-LABEL: 'mul_2i32' +define void @mul_2i32() { + ; A <2 x i32> gets expanded to a <2 x i64> vector. + ; A <2 x i64> vector multiply is implemented using + ; 3 PMULUDQ and 2 PADDS and 4 shifts. 
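  ; As a rough sketch of that expansion (illustrative only; the exact
  ; SelectionDAG lowering may differ), each 64-bit lane product is assembled
  ; from 32x32->64 pieces:
  ;   lo     = pmuludq(a, b)               ; lo32(a) * lo32(b)
  ;   cross1 = pmuludq(a, lshr(b, 32))     ; lo32(a) * hi32(b)
  ;   cross2 = pmuludq(lshr(a, 32), b)     ; hi32(a) * lo32(b)
  ;   result = lo + shl(cross1 + cross2, 32)
  ; which accounts for the three multiplies, the adds and the shift work
  ; reflected in the cost estimates below.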
+ ; SSSE3: cost of 8 {{.*}} %A0 = mul + ; SSE42: cost of 8 {{.*}} %A0 = mul + ; AVX: cost of 8 {{.*}} %A0 = mul + ; AVX2: cost of 8 {{.*}} %A0 = mul + ; AVX512F: cost of 8 {{.*}} %A0 = mul + ; AVX512BW: cost of 8 {{.*}} %A0 = mul + ; AVX512DQ: cost of 1 {{.*}} %A0 = mul + %A0 = mul <2 x i32> undef, undef ret void } diff --git a/test/Analysis/CostModel/X86/ctbits-cost.ll b/test/Analysis/CostModel/X86/ctbits-cost.ll index 23bfafd8bc94..8c7fa9d73151 100644 --- a/test/Analysis/CostModel/X86/ctbits-cost.ll +++ b/test/Analysis/CostModel/X86/ctbits-cost.ll @@ -2,8 +2,8 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE42 -check-prefix=POPCNT ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX1 -check-prefix=POPCNT -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2 -check-prefix=POPCNT +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 -check-prefix=POPCNT +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 -check-prefix=POPCNT ; Verify the cost of scalar population count instructions. 
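; The per-type estimates below roughly track a bit-twiddling expansion of
; ctpop done with ordinary vector ops (SSSE3+ targets may instead use a
; PSHUFB nibble look-up, which is why their costs are lower). A minimal
; sketch for <2 x i64> in generic IR -- the function name and structure are
; illustrative, not what the backend actually emits:
define <2 x i64> @ctpop_v2i64_sketch(<2 x i64> %v) {
  ; v - ((v >> 1) & 0x5555...): fold adjacent bit pairs
  %s1 = lshr <2 x i64> %v, <i64 1, i64 1>
  %m1 = and <2 x i64> %s1, <i64 6148914691236517205, i64 6148914691236517205>
  %a  = sub <2 x i64> %v, %m1
  ; (a & 0x3333...) + ((a >> 2) & 0x3333...): sum 2-bit fields
  %b0 = and <2 x i64> %a, <i64 3689348814741910323, i64 3689348814741910323>
  %s2 = lshr <2 x i64> %a, <i64 2, i64 2>
  %b1 = and <2 x i64> %s2, <i64 3689348814741910323, i64 3689348814741910323>
  %b  = add <2 x i64> %b0, %b1
  ; (b + (b >> 4)) & 0x0f0f...: sum 4-bit fields into byte counts
  %s4 = lshr <2 x i64> %b, <i64 4, i64 4>
  %c0 = add <2 x i64> %b, %s4
  %c  = and <2 x i64> %c0, <i64 1085102592571150095, i64 1085102592571150095>
  ; multiply by 0x0101... and shift so the top byte holds the lane's popcount
  %d  = mul <2 x i64> %c, <i64 72340172838076673, i64 72340172838076673>
  %pop = lshr <2 x i64> %d, <i64 56, i64 56>
  ret <2 x i64> %pop
}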
@@ -58,72 +58,76 @@ declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>) define <2 x i64> @var_ctpop_v2i64(<2 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v2i64': -; SSE: Found an estimated cost of 2 for instruction: %ctpop -; AVX: Found an estimated cost of 2 for instruction: %ctpop -; XOP: Found an estimated cost of 2 for instruction: %ctpop +; SSE2: Found an estimated cost of 12 for instruction: %ctpop +; SSE42: Found an estimated cost of 7 for instruction: %ctpop +; AVX: Found an estimated cost of 7 for instruction: %ctpop %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) ret <2 x i64> %ctpop } define <4 x i64> @var_ctpop_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v4i64': -; SSE: Found an estimated cost of 4 for instruction: %ctpop -; AVX: Found an estimated cost of 2 for instruction: %ctpop -; XOP: Found an estimated cost of 2 for instruction: %ctpop +; SSE2: Found an estimated cost of 24 for instruction: %ctpop +; SSE42: Found an estimated cost of 14 for instruction: %ctpop +; AVX1: Found an estimated cost of 14 for instruction: %ctpop +; AVX2: Found an estimated cost of 7 for instruction: %ctpop %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a) ret <4 x i64> %ctpop } define <4 x i32> @var_ctpop_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v4i32': -; SSE: Found an estimated cost of 2 for instruction: %ctpop -; AVX: Found an estimated cost of 2 for instruction: %ctpop -; XOP: Found an estimated cost of 2 for instruction: %ctpop +; SSE2: Found an estimated cost of 15 for instruction: %ctpop +; SSE42: Found an estimated cost of 11 for instruction: %ctpop +; AVX: Found an estimated cost of 11 for instruction: %ctpop %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a) ret <4 x i32> %ctpop } define <8 x i32> @var_ctpop_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v8i32': -; SSE: Found an estimated cost of 4 for instruction: %ctpop -; AVX: Found an estimated cost of 2 for instruction: %ctpop -; XOP: Found an estimated cost of 2 for instruction: %ctpop +; SSE2: Found an estimated cost of 30 for instruction: %ctpop +; SSE42: Found an estimated cost of 22 for instruction: %ctpop +; AVX1: Found an estimated cost of 22 for instruction: %ctpop +; AVX2: Found an estimated cost of 11 for instruction: %ctpop %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a) ret <8 x i32> %ctpop } define <8 x i16> @var_ctpop_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v8i16': -; SSE: Found an estimated cost of 2 for instruction: %ctpop -; AVX: Found an estimated cost of 2 for instruction: %ctpop -; XOP: Found an estimated cost of 2 for instruction: %ctpop +; SSE2: Found an estimated cost of 13 for instruction: %ctpop +; SSE42: Found an estimated cost of 9 for instruction: %ctpop +; AVX: Found an estimated cost of 9 for instruction: %ctpop %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a) ret <8 x i16> %ctpop } define <16 x i16> @var_ctpop_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v16i16': -; SSE: Found an estimated cost of 4 for instruction: %ctpop -; AVX: Found an estimated cost of 2 for instruction: %ctpop -; XOP: Found an estimated cost of 2 for instruction: %ctpop +; SSE2: Found an estimated cost of 26 for instruction: %ctpop +; SSE42: Found an estimated cost of 18 for instruction: %ctpop +; AVX1: Found an estimated cost of 18 for instruction: %ctpop +; AVX2: Found an estimated cost of 9 for instruction: 
%ctpop %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a) ret <16 x i16> %ctpop } define <16 x i8> @var_ctpop_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v16i8': -; SSE: Found an estimated cost of 2 for instruction: %ctpop -; AVX: Found an estimated cost of 2 for instruction: %ctpop -; XOP: Found an estimated cost of 2 for instruction: %ctpop +; SSE2: Found an estimated cost of 10 for instruction: %ctpop +; SSE42: Found an estimated cost of 6 for instruction: %ctpop +; AVX: Found an estimated cost of 6 for instruction: %ctpop %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) ret <16 x i8> %ctpop } define <32 x i8> @var_ctpop_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctpop_v32i8': -; SSE: Found an estimated cost of 4 for instruction: %ctpop -; AVX: Found an estimated cost of 2 for instruction: %ctpop -; XOP: Found an estimated cost of 2 for instruction: %ctpop +; SSE2: Found an estimated cost of 20 for instruction: %ctpop +; SSE42: Found an estimated cost of 12 for instruction: %ctpop +; AVX1: Found an estimated cost of 12 for instruction: %ctpop +; AVX2: Found an estimated cost of 6 for instruction: %ctpop %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a) ret <32 x i8> %ctpop } @@ -205,144 +209,152 @@ declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>, i1) define <2 x i64> @var_ctlz_v2i64(<2 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v2i64': -; SSE: Found an estimated cost of 6 for instruction: %ctlz -; AVX: Found an estimated cost of 6 for instruction: %ctlz -; XOP: Found an estimated cost of 6 for instruction: %ctlz +; SSE2: Found an estimated cost of 25 for instruction: %ctlz +; SSE42: Found an estimated cost of 23 for instruction: %ctlz +; AVX: Found an estimated cost of 23 for instruction: %ctlz %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 0) ret <2 x i64> %ctlz } define <2 x i64> @var_ctlz_v2i64u(<2 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v2i64u': -; SSE: Found an estimated cost of 6 for instruction: %ctlz -; AVX: Found an estimated cost of 6 for instruction: %ctlz -; XOP: Found an estimated cost of 6 for instruction: %ctlz +; SSE2: Found an estimated cost of 25 for instruction: %ctlz +; SSE42: Found an estimated cost of 23 for instruction: %ctlz +; AVX: Found an estimated cost of 23 for instruction: %ctlz %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 1) ret <2 x i64> %ctlz } define <4 x i64> @var_ctlz_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i64': -; SSE: Found an estimated cost of 12 for instruction: %ctlz -; AVX: Found an estimated cost of 12 for instruction: %ctlz -; XOP: Found an estimated cost of 12 for instruction: %ctlz +; SSE2: Found an estimated cost of 50 for instruction: %ctlz +; SSE42: Found an estimated cost of 46 for instruction: %ctlz +; AVX1: Found an estimated cost of 46 for instruction: %ctlz +; AVX2: Found an estimated cost of 23 for instruction: %ctlz %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 0) ret <4 x i64> %ctlz } define <4 x i64> @var_ctlz_v4i64u(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i64u': -; SSE: Found an estimated cost of 12 for instruction: %ctlz -; AVX: Found an estimated cost of 12 for instruction: %ctlz -; XOP: Found an estimated cost of 12 for instruction: %ctlz +; SSE2: Found an estimated cost of 50 for instruction: %ctlz +; SSE42: Found an estimated cost of 46 for instruction: %ctlz +; AVX1: Found an estimated cost of 
46 for instruction: %ctlz +; AVX2: Found an estimated cost of 23 for instruction: %ctlz %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 1) ret <4 x i64> %ctlz } define <4 x i32> @var_ctlz_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i32': -; SSE: Found an estimated cost of 12 for instruction: %ctlz -; AVX: Found an estimated cost of 12 for instruction: %ctlz -; XOP: Found an estimated cost of 12 for instruction: %ctlz +; SSE2: Found an estimated cost of 26 for instruction: %ctlz +; SSE42: Found an estimated cost of 18 for instruction: %ctlz +; AVX: Found an estimated cost of 18 for instruction: %ctlz %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 0) ret <4 x i32> %ctlz } define <4 x i32> @var_ctlz_v4i32u(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v4i32u': -; SSE: Found an estimated cost of 12 for instruction: %ctlz -; AVX: Found an estimated cost of 12 for instruction: %ctlz -; XOP: Found an estimated cost of 12 for instruction: %ctlz +; SSE2: Found an estimated cost of 26 for instruction: %ctlz +; SSE42: Found an estimated cost of 18 for instruction: %ctlz +; AVX: Found an estimated cost of 18 for instruction: %ctlz %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 1) ret <4 x i32> %ctlz } define <8 x i32> @var_ctlz_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i32': -; SSE: Found an estimated cost of 24 for instruction: %ctlz -; AVX: Found an estimated cost of 24 for instruction: %ctlz -; XOP: Found an estimated cost of 24 for instruction: %ctlz +; SSE2: Found an estimated cost of 52 for instruction: %ctlz +; SSE42: Found an estimated cost of 36 for instruction: %ctlz +; AVX1: Found an estimated cost of 36 for instruction: %ctlz +; AVX2: Found an estimated cost of 18 for instruction: %ctlz %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 0) ret <8 x i32> %ctlz } define <8 x i32> @var_ctlz_v8i32u(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i32u': -; SSE: Found an estimated cost of 24 for instruction: %ctlz -; AVX: Found an estimated cost of 24 for instruction: %ctlz -; XOP: Found an estimated cost of 24 for instruction: %ctlz +; SSE2: Found an estimated cost of 52 for instruction: %ctlz +; SSE42: Found an estimated cost of 36 for instruction: %ctlz +; AVX1: Found an estimated cost of 36 for instruction: %ctlz +; AVX2: Found an estimated cost of 18 for instruction: %ctlz %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 1) ret <8 x i32> %ctlz } define <8 x i16> @var_ctlz_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i16': -; SSE: Found an estimated cost of 24 for instruction: %ctlz -; AVX: Found an estimated cost of 24 for instruction: %ctlz -; XOP: Found an estimated cost of 24 for instruction: %ctlz +; SSE2: Found an estimated cost of 20 for instruction: %ctlz +; SSE42: Found an estimated cost of 14 for instruction: %ctlz +; AVX: Found an estimated cost of 14 for instruction: %ctlz %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 0) ret <8 x i16> %ctlz } define <8 x i16> @var_ctlz_v8i16u(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v8i16u': -; SSE: Found an estimated cost of 24 for instruction: %ctlz -; AVX: Found an estimated cost of 24 for instruction: %ctlz -; XOP: Found an estimated cost of 24 for instruction: %ctlz +; SSE2: Found an estimated cost of 20 for instruction: %ctlz +; SSE42: Found an estimated cost of 14 for instruction: %ctlz +; AVX: Found an 
estimated cost of 14 for instruction: %ctlz %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 1) ret <8 x i16> %ctlz } define <16 x i16> @var_ctlz_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i16': -; SSE: Found an estimated cost of 48 for instruction: %ctlz -; AVX: Found an estimated cost of 48 for instruction: %ctlz -; XOP: Found an estimated cost of 48 for instruction: %ctlz +; SSE2: Found an estimated cost of 40 for instruction: %ctlz +; SSE42: Found an estimated cost of 28 for instruction: %ctlz +; AVX1: Found an estimated cost of 28 for instruction: %ctlz +; AVX2: Found an estimated cost of 14 for instruction: %ctlz %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 0) ret <16 x i16> %ctlz } define <16 x i16> @var_ctlz_v16i16u(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i16u': -; SSE: Found an estimated cost of 48 for instruction: %ctlz -; AVX: Found an estimated cost of 48 for instruction: %ctlz -; XOP: Found an estimated cost of 48 for instruction: %ctlz +; SSE2: Found an estimated cost of 40 for instruction: %ctlz +; SSE42: Found an estimated cost of 28 for instruction: %ctlz +; AVX1: Found an estimated cost of 28 for instruction: %ctlz +; AVX2: Found an estimated cost of 14 for instruction: %ctlz %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 1) ret <16 x i16> %ctlz } define <16 x i8> @var_ctlz_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i8': -; SSE: Found an estimated cost of 48 for instruction: %ctlz -; AVX: Found an estimated cost of 48 for instruction: %ctlz -; XOP: Found an estimated cost of 48 for instruction: %ctlz +; SSE2: Found an estimated cost of 17 for instruction: %ctlz +; SSE42: Found an estimated cost of 9 for instruction: %ctlz +; AVX: Found an estimated cost of 9 for instruction: %ctlz %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 0) ret <16 x i8> %ctlz } define <16 x i8> @var_ctlz_v16i8u(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v16i8u': -; SSE: Found an estimated cost of 48 for instruction: %ctlz -; AVX: Found an estimated cost of 48 for instruction: %ctlz -; XOP: Found an estimated cost of 48 for instruction: %ctlz +; SSE2: Found an estimated cost of 17 for instruction: %ctlz +; SSE42: Found an estimated cost of 9 for instruction: %ctlz +; AVX: Found an estimated cost of 9 for instruction: %ctlz %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 1) ret <16 x i8> %ctlz } define <32 x i8> @var_ctlz_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v32i8': -; SSE: Found an estimated cost of 96 for instruction: %ctlz -; AVX: Found an estimated cost of 96 for instruction: %ctlz -; XOP: Found an estimated cost of 96 for instruction: %ctlz +; SSE2: Found an estimated cost of 34 for instruction: %ctlz +; SSE42: Found an estimated cost of 18 for instruction: %ctlz +; AVX1: Found an estimated cost of 18 for instruction: %ctlz +; AVX2: Found an estimated cost of 9 for instruction: %ctlz %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 0) ret <32 x i8> %ctlz } define <32 x i8> @var_ctlz_v32i8u(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_ctlz_v32i8u': -; SSE: Found an estimated cost of 96 for instruction: %ctlz -; AVX: Found an estimated cost of 96 for instruction: %ctlz -; XOP: Found an estimated cost of 96 for instruction: %ctlz +; SSE2: Found an estimated cost of 34 for instruction: %ctlz +; SSE42: Found an estimated cost of 18 for instruction: 
%ctlz +; AVX1: Found an estimated cost of 18 for instruction: %ctlz +; AVX2: Found an estimated cost of 9 for instruction: %ctlz %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 1) ret <32 x i8> %ctlz } @@ -424,144 +436,152 @@ declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1) define <2 x i64> @var_cttz_v2i64(<2 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v2i64': -; SSE: Found an estimated cost of 6 for instruction: %cttz -; AVX: Found an estimated cost of 6 for instruction: %cttz -; XOP: Found an estimated cost of 6 for instruction: %cttz +; SSE2: Found an estimated cost of 14 for instruction: %cttz +; SSE42: Found an estimated cost of 10 for instruction: %cttz +; AVX: Found an estimated cost of 10 for instruction: %cttz %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 0) ret <2 x i64> %cttz } define <2 x i64> @var_cttz_v2i64u(<2 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v2i64u': -; SSE: Found an estimated cost of 6 for instruction: %cttz -; AVX: Found an estimated cost of 6 for instruction: %cttz -; XOP: Found an estimated cost of 6 for instruction: %cttz +; SSE2: Found an estimated cost of 14 for instruction: %cttz +; SSE42: Found an estimated cost of 10 for instruction: %cttz +; AVX: Found an estimated cost of 10 for instruction: %cttz %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 1) ret <2 x i64> %cttz } define <4 x i64> @var_cttz_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v4i64': -; SSE: Found an estimated cost of 12 for instruction: %cttz -; AVX: Found an estimated cost of 12 for instruction: %cttz -; XOP: Found an estimated cost of 12 for instruction: %cttz +; SSE2: Found an estimated cost of 28 for instruction: %cttz +; SSE42: Found an estimated cost of 20 for instruction: %cttz +; AVX1: Found an estimated cost of 20 for instruction: %cttz +; AVX2: Found an estimated cost of 10 for instruction: %cttz %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 0) ret <4 x i64> %cttz } define <4 x i64> @var_cttz_v4i64u(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v4i64u': -; SSE: Found an estimated cost of 12 for instruction: %cttz -; AVX: Found an estimated cost of 12 for instruction: %cttz -; XOP: Found an estimated cost of 12 for instruction: %cttz +; SSE2: Found an estimated cost of 28 for instruction: %cttz +; SSE42: Found an estimated cost of 20 for instruction: %cttz +; AVX1: Found an estimated cost of 20 for instruction: %cttz +; AVX2: Found an estimated cost of 10 for instruction: %cttz %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 1) ret <4 x i64> %cttz } define <4 x i32> @var_cttz_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v4i32': -; SSE: Found an estimated cost of 12 for instruction: %cttz -; AVX: Found an estimated cost of 12 for instruction: %cttz -; XOP: Found an estimated cost of 12 for instruction: %cttz +; SSE2: Found an estimated cost of 18 for instruction: %cttz +; SSE42: Found an estimated cost of 14 for instruction: %cttz +; AVX: Found an estimated cost of 14 for instruction: %cttz %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 0) ret <4 x i32> %cttz } define <4 x i32> @var_cttz_v4i32u(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v4i32u': -; SSE: Found an estimated cost of 12 for instruction: %cttz -; AVX: Found an estimated cost of 12 for instruction: %cttz -; XOP: Found an estimated cost of 12 for instruction: %cttz +; SSE2: Found an estimated cost of 18 
for instruction: %cttz +; SSE42: Found an estimated cost of 14 for instruction: %cttz +; AVX: Found an estimated cost of 14 for instruction: %cttz %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 1) ret <4 x i32> %cttz } define <8 x i32> @var_cttz_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i32': -; SSE: Found an estimated cost of 24 for instruction: %cttz -; AVX: Found an estimated cost of 24 for instruction: %cttz -; XOP: Found an estimated cost of 24 for instruction: %cttz +; SSE2: Found an estimated cost of 36 for instruction: %cttz +; SSE42: Found an estimated cost of 28 for instruction: %cttz +; AVX1: Found an estimated cost of 28 for instruction: %cttz +; AVX2: Found an estimated cost of 14 for instruction: %cttz %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 0) ret <8 x i32> %cttz } define <8 x i32> @var_cttz_v8i32u(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i32u': -; SSE: Found an estimated cost of 24 for instruction: %cttz -; AVX: Found an estimated cost of 24 for instruction: %cttz -; XOP: Found an estimated cost of 24 for instruction: %cttz +; SSE2: Found an estimated cost of 36 for instruction: %cttz +; SSE42: Found an estimated cost of 28 for instruction: %cttz +; AVX1: Found an estimated cost of 28 for instruction: %cttz +; AVX2: Found an estimated cost of 14 for instruction: %cttz %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 1) ret <8 x i32> %cttz } define <8 x i16> @var_cttz_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i16': -; SSE: Found an estimated cost of 24 for instruction: %cttz -; AVX: Found an estimated cost of 24 for instruction: %cttz -; XOP: Found an estimated cost of 24 for instruction: %cttz +; SSE2: Found an estimated cost of 16 for instruction: %cttz +; SSE42: Found an estimated cost of 12 for instruction: %cttz +; AVX: Found an estimated cost of 12 for instruction: %cttz %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 0) ret <8 x i16> %cttz } define <8 x i16> @var_cttz_v8i16u(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v8i16u': -; SSE: Found an estimated cost of 24 for instruction: %cttz -; AVX: Found an estimated cost of 24 for instruction: %cttz -; XOP: Found an estimated cost of 24 for instruction: %cttz +; SSE2: Found an estimated cost of 16 for instruction: %cttz +; SSE42: Found an estimated cost of 12 for instruction: %cttz +; AVX: Found an estimated cost of 12 for instruction: %cttz %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 1) ret <8 x i16> %cttz } define <16 x i16> @var_cttz_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i16': -; SSE: Found an estimated cost of 48 for instruction: %cttz -; AVX: Found an estimated cost of 48 for instruction: %cttz -; XOP: Found an estimated cost of 48 for instruction: %cttz +; SSE2: Found an estimated cost of 32 for instruction: %cttz +; SSE42: Found an estimated cost of 24 for instruction: %cttz +; AVX1: Found an estimated cost of 24 for instruction: %cttz +; AVX2: Found an estimated cost of 12 for instruction: %cttz %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 0) ret <16 x i16> %cttz } define <16 x i16> @var_cttz_v16i16u(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i16u': -; SSE: Found an estimated cost of 48 for instruction: %cttz -; AVX: Found an estimated cost of 48 for instruction: %cttz -; XOP: Found an estimated cost of 48 for instruction: %cttz +; SSE2: Found 
an estimated cost of 32 for instruction: %cttz +; SSE42: Found an estimated cost of 24 for instruction: %cttz +; AVX1: Found an estimated cost of 24 for instruction: %cttz +; AVX2: Found an estimated cost of 12 for instruction: %cttz %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 1) ret <16 x i16> %cttz } define <16 x i8> @var_cttz_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i8': -; SSE: Found an estimated cost of 48 for instruction: %cttz -; AVX: Found an estimated cost of 48 for instruction: %cttz -; XOP: Found an estimated cost of 48 for instruction: %cttz +; SSE2: Found an estimated cost of 13 for instruction: %cttz +; SSE42: Found an estimated cost of 9 for instruction: %cttz +; AVX: Found an estimated cost of 9 for instruction: %cttz %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 0) ret <16 x i8> %cttz } define <16 x i8> @var_cttz_v16i8u(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v16i8u': -; SSE: Found an estimated cost of 48 for instruction: %cttz -; AVX: Found an estimated cost of 48 for instruction: %cttz -; XOP: Found an estimated cost of 48 for instruction: %cttz +; SSE2: Found an estimated cost of 13 for instruction: %cttz +; SSE42: Found an estimated cost of 9 for instruction: %cttz +; AVX: Found an estimated cost of 9 for instruction: %cttz %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 1) ret <16 x i8> %cttz } define <32 x i8> @var_cttz_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v32i8': -; SSE: Found an estimated cost of 96 for instruction: %cttz -; AVX: Found an estimated cost of 96 for instruction: %cttz -; XOP: Found an estimated cost of 96 for instruction: %cttz +; SSE2: Found an estimated cost of 26 for instruction: %cttz +; SSE42: Found an estimated cost of 18 for instruction: %cttz +; AVX1: Found an estimated cost of 18 for instruction: %cttz +; AVX2: Found an estimated cost of 9 for instruction: %cttz %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 0) ret <32 x i8> %cttz } define <32 x i8> @var_cttz_v32i8u(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'var_cttz_v32i8u': -; SSE: Found an estimated cost of 96 for instruction: %cttz -; AVX: Found an estimated cost of 96 for instruction: %cttz -; XOP: Found an estimated cost of 96 for instruction: %cttz +; SSE2: Found an estimated cost of 26 for instruction: %cttz +; SSE42: Found an estimated cost of 18 for instruction: %cttz +; AVX1: Found an estimated cost of 18 for instruction: %cttz +; AVX2: Found an estimated cost of 9 for instruction: %cttz %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 1) ret <32 x i8> %cttz } diff --git a/test/Analysis/CostModel/X86/div.ll b/test/Analysis/CostModel/X86/div.ll index c7d6517c7f03..0ac06ff75ebe 100644 --- a/test/Analysis/CostModel/X86/div.ll +++ b/test/Analysis/CostModel/X86/div.ll @@ -1,32 +1,376 @@ -; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s -; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck --check-prefix=AVX2 %s - - -define void @div_sse() { - ; SSE2: div_sse - ; SSE2: cost of 320 {{.*}} sdiv - %a0 = sdiv <16 x i8> undef, undef - ; SSE2: cost of 160 {{.*}} sdiv - %a1 = sdiv <8 x i16> undef, undef - ; SSE2: cost of 80 {{.*}} sdiv - %a2 = sdiv <4 x i32> undef, undef - ; SSE2: cost of 40 {{.*}} sdiv - %a3 = sdiv <2 x i32> undef, undef - ret void +; RUN: opt < %s -cost-model -analyze 
-mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSSE3 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; CHECK-LABEL: 'sdiv' +define i32 @sdiv() { + ; CHECK: cost of 1 {{.*}} %I64 = sdiv + %I64 = sdiv i64 undef, undef + ; SSE: cost of 40 {{.*}} %V2i64 = sdiv + ; AVX: cost of 40 {{.*}} %V2i64 = sdiv + %V2i64 = sdiv <2 x i64> undef, undef + ; SSE: cost of 80 {{.*}} %V4i64 = sdiv + ; AVX: cost of 80 {{.*}} %V4i64 = sdiv + %V4i64 = sdiv <4 x i64> undef, undef + ; SSE: cost of 160 {{.*}} %V8i64 = sdiv + ; AVX: cost of 160 {{.*}} %V8i64 = sdiv + %V8i64 = sdiv <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = sdiv + %I32 = sdiv i32 undef, undef + ; SSE: cost of 80 {{.*}} %V4i32 = sdiv + ; AVX: cost of 80 {{.*}} %V4i32 = sdiv + %V4i32 = sdiv <4 x i32> undef, undef + ; SSE: cost of 160 {{.*}} %V8i32 = sdiv + ; AVX: cost of 160 {{.*}} %V8i32 = sdiv + %V8i32 = sdiv <8 x i32> undef, undef + ; SSE: cost of 320 {{.*}} %V16i32 = sdiv + ; AVX: cost of 320 {{.*}} %V16i32 = sdiv + %V16i32 = sdiv <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = sdiv + %I16 = sdiv i16 undef, undef + ; SSE: cost of 160 {{.*}} %V8i16 = sdiv + ; AVX: cost of 160 {{.*}} %V8i16 = sdiv + %V8i16 = sdiv <8 x i16> undef, undef + ; SSE: cost of 320 {{.*}} %V16i16 = sdiv + ; AVX: cost of 320 {{.*}} %V16i16 = sdiv + %V16i16 = sdiv <16 x i16> undef, undef + ; SSE: cost of 640 {{.*}} %V32i16 = sdiv + ; AVX: cost of 640 {{.*}} %V32i16 = sdiv + %V32i16 = sdiv <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = sdiv + %I8 = sdiv i8 undef, undef + ; SSE: cost of 320 {{.*}} %V16i8 = sdiv + ; AVX: cost of 320 {{.*}} %V16i8 = sdiv + %V16i8 = sdiv <16 x i8> undef, undef + ; SSE: cost of 640 {{.*}} %V32i8 = sdiv + ; AVX: cost of 640 {{.*}} %V32i8 = sdiv + %V32i8 = sdiv <32 x i8> undef, undef + ; SSE: cost of 1280 {{.*}} %V64i8 = sdiv + ; AVX: cost of 1280 {{.*}} %V64i8 = sdiv + %V64i8 = sdiv <64 x i8> undef, undef + + ret i32 undef } -; SSE2: div_avx - -define void @div_avx() { - ; AVX2: div_avx - ; AVX2: cost of 640 {{.*}} sdiv - %a0 = sdiv <32 x i8> undef, undef - ; AVX2: cost of 320 {{.*}} sdiv - %a1 = sdiv <16 x i16> undef, undef - ; AVX2: cost of 160 {{.*}} sdiv - %a2 = sdiv <8 x i32> undef, undef - ; AVX2: cost of 80 {{.*}} sdiv - %a3 = sdiv <4 x i32> undef, undef - ret void + +; CHECK-LABEL: 'udiv' 
+define i32 @udiv() { + ; CHECK: cost of 1 {{.*}} %I64 = udiv + %I64 = udiv i64 undef, undef + ; SSE: cost of 40 {{.*}} %V2i64 = udiv + ; AVX: cost of 40 {{.*}} %V2i64 = udiv + %V2i64 = udiv <2 x i64> undef, undef + ; SSE: cost of 80 {{.*}} %V4i64 = udiv + ; AVX: cost of 80 {{.*}} %V4i64 = udiv + %V4i64 = udiv <4 x i64> undef, undef + ; SSE: cost of 160 {{.*}} %V8i64 = udiv + ; AVX: cost of 160 {{.*}} %V8i64 = udiv + %V8i64 = udiv <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = udiv + %I32 = udiv i32 undef, undef + ; SSE: cost of 80 {{.*}} %V4i32 = udiv + ; AVX: cost of 80 {{.*}} %V4i32 = udiv + %V4i32 = udiv <4 x i32> undef, undef + ; SSE: cost of 160 {{.*}} %V8i32 = udiv + ; AVX: cost of 160 {{.*}} %V8i32 = udiv + %V8i32 = udiv <8 x i32> undef, undef + ; SSE: cost of 320 {{.*}} %V16i32 = udiv + ; AVX: cost of 320 {{.*}} %V16i32 = udiv + %V16i32 = udiv <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = udiv + %I16 = udiv i16 undef, undef + ; SSE: cost of 160 {{.*}} %V8i16 = udiv + ; AVX: cost of 160 {{.*}} %V8i16 = udiv + %V8i16 = udiv <8 x i16> undef, undef + ; SSE: cost of 320 {{.*}} %V16i16 = udiv + ; AVX: cost of 320 {{.*}} %V16i16 = udiv + %V16i16 = udiv <16 x i16> undef, undef + ; SSE: cost of 640 {{.*}} %V32i16 = udiv + ; AVX: cost of 640 {{.*}} %V32i16 = udiv + %V32i16 = udiv <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = udiv + %I8 = udiv i8 undef, undef + ; SSE: cost of 320 {{.*}} %V16i8 = udiv + ; AVX: cost of 320 {{.*}} %V16i8 = udiv + %V16i8 = udiv <16 x i8> undef, undef + ; SSE: cost of 640 {{.*}} %V32i8 = udiv + ; AVX: cost of 640 {{.*}} %V32i8 = udiv + %V32i8 = udiv <32 x i8> undef, undef + ; SSE: cost of 1280 {{.*}} %V64i8 = udiv + ; AVX: cost of 1280 {{.*}} %V64i8 = udiv + %V64i8 = udiv <64 x i8> undef, undef + + ret i32 undef +} + +; CHECK-LABEL: 'sdiv_uniformconst' +define i32 @sdiv_uniformconst() { + ; CHECK: cost of 1 {{.*}} %I64 = sdiv + %I64 = sdiv i64 undef, 7 + ; SSE: cost of 40 {{.*}} %V2i64 = sdiv + ; AVX: cost of 40 {{.*}} %V2i64 = sdiv + %V2i64 = sdiv <2 x i64> undef, <i64 7, i64 7> + ; SSE: cost of 80 {{.*}} %V4i64 = sdiv + ; AVX: cost of 80 {{.*}} %V4i64 = sdiv + %V4i64 = sdiv <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7> + ; SSE: cost of 160 {{.*}} %V8i64 = sdiv + ; AVX: cost of 160 {{.*}} %V8i64 = sdiv + %V8i64 = sdiv <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7> + + ; CHECK: cost of 1 {{.*}} %I32 = sdiv + %I32 = sdiv i32 undef, 7 + ; SSE2: cost of 19 {{.*}} %V4i32 = sdiv + ; SSSE3: cost of 19 {{.*}} %V4i32 = sdiv + ; SSE42: cost of 15 {{.*}} %V4i32 = sdiv + ; AVX: cost of 15 {{.*}} %V4i32 = sdiv + %V4i32 = sdiv <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7> + ; SSE2: cost of 38 {{.*}} %V8i32 = sdiv + ; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv + ; SSE42: cost of 30 {{.*}} %V8i32 = sdiv + ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv + ; AVX2: cost of 15 {{.*}} %V8i32 = sdiv + ; AVX512: cost of 15 {{.*}} %V8i32 = sdiv + %V8i32 = sdiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> + ; SSE2: cost of 76 {{.*}} %V16i32 = sdiv + ; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv + ; SSE42: cost of 60 {{.*}} %V16i32 = sdiv + ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv + ; AVX2: cost of 30 {{.*}} %V16i32 = sdiv + ; AVX512: cost of 15 {{.*}} %V16i32 = sdiv + %V16i32 = sdiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> + + ; CHECK: cost of 1 {{.*}} %I16 = sdiv + %I16 = sdiv i16 undef, 7 + ; SSE: cost of 6 {{.*}} 
%V8i16 = sdiv + ; AVX: cost of 6 {{.*}} %V8i16 = sdiv + %V8i16 = sdiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> + ; SSE: cost of 12 {{.*}} %V16i16 = sdiv + ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv + ; AVX2: cost of 6 {{.*}} %V16i16 = sdiv + ; AVX512: cost of 6 {{.*}} %V16i16 = sdiv + %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> + ; SSE: cost of 24 {{.*}} %V32i16 = sdiv + ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv + ; AVX2: cost of 12 {{.*}} %V32i16 = sdiv + ; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv + ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv + %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> + + ; CHECK: cost of 1 {{.*}} %I8 = sdiv + %I8 = sdiv i8 undef, 7 + ; SSE: cost of 320 {{.*}} %V16i8 = sdiv + ; AVX: cost of 320 {{.*}} %V16i8 = sdiv + %V16i8 = sdiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> + ; SSE: cost of 640 {{.*}} %V32i8 = sdiv + ; AVX: cost of 640 {{.*}} %V32i8 = sdiv + %V32i8 = sdiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> + ; SSE: cost of 1280 {{.*}} %V64i8 = sdiv + ; AVX: cost of 1280 {{.*}} %V64i8 = sdiv + %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> + + ret i32 undef } +; CHECK-LABEL: 'udiv_uniformconst' +define i32 @udiv_uniformconst() { + ; CHECK: cost of 1 {{.*}} %I64 = udiv + %I64 = udiv i64 undef, 7 + ; SSE: cost of 40 {{.*}} %V2i64 = udiv + ; AVX: cost of 40 {{.*}} %V2i64 = udiv + %V2i64 = udiv <2 x i64> undef, <i64 7, i64 7> + ; SSE: cost of 80 {{.*}} %V4i64 = udiv + ; AVX: cost of 80 {{.*}} %V4i64 = udiv + %V4i64 = udiv <4 x i64> undef, <i64 7, i64 7, i64 7, i64 7> + ; SSE: cost of 160 {{.*}} %V8i64 = udiv + ; AVX: cost of 160 {{.*}} %V8i64 = udiv + %V8i64 = udiv <8 x i64> undef, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7> + ; CHECK: cost of 1 {{.*}} %I32 = udiv + %I32 = udiv i32 undef, 7 + ; SSE: cost of 15 {{.*}} %V4i32 = udiv + ; AVX: cost of 15 {{.*}} %V4i32 = udiv + %V4i32 = udiv <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7> + ; SSE: cost of 30 {{.*}} %V8i32 = udiv + ; AVX1: cost of 30 {{.*}} %V8i32 = udiv + ; AVX2: cost of 15 {{.*}} %V8i32 = udiv + ; AVX512: cost of 15 {{.*}} %V8i32 = udiv + %V8i32 = udiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> + ; SSE: cost of 60 {{.*}} %V16i32 = udiv + ; AVX1: cost of 60 {{.*}} %V16i32 = udiv + ; AVX2: cost of 30 {{.*}} %V16i32 = udiv + ; AVX512: cost of 15 {{.*}} %V16i32 = udiv + %V16i32 = udiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> + + ; CHECK: cost of 1 {{.*}} %I16 = udiv + %I16 = udiv i16 undef, 7 + ; SSE: cost of 6 {{.*}} %V8i16 = udiv + ; AVX: 
cost of 6 {{.*}} %V8i16 = udiv + %V8i16 = udiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> + ; SSE: cost of 12 {{.*}} %V16i16 = udiv + ; AVX1: cost of 12 {{.*}} %V16i16 = udiv + ; AVX2: cost of 6 {{.*}} %V16i16 = udiv + ; AVX512: cost of 6 {{.*}} %V16i16 = udiv + %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> + ; SSE: cost of 24 {{.*}} %V32i16 = udiv + ; AVX1: cost of 24 {{.*}} %V32i16 = udiv + ; AVX2: cost of 12 {{.*}} %V32i16 = udiv + ; AVX512F: cost of 12 {{.*}} %V32i16 = udiv + ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv + %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> + + ; CHECK: cost of 1 {{.*}} %I8 = udiv + %I8 = udiv i8 undef, 7 + ; SSE: cost of 320 {{.*}} %V16i8 = udiv + ; AVX: cost of 320 {{.*}} %V16i8 = udiv + %V16i8 = udiv <16 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> + ; SSE: cost of 640 {{.*}} %V32i8 = udiv + ; AVX: cost of 640 {{.*}} %V32i8 = udiv + %V32i8 = udiv <32 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> + ; SSE: cost of 1280 {{.*}} %V64i8 = udiv + ; AVX: cost of 1280 {{.*}} %V64i8 = udiv + %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> + + ret i32 undef +} + +; CHECK-LABEL: 'sdiv_uniformconstpow2' +define i32 @sdiv_uniformconstpow2() { + ; CHECK: cost of 1 {{.*}} %I64 = sdiv + %I64 = sdiv i64 undef, 16 + ; SSE: cost of 40 {{.*}} %V2i64 = sdiv + ; AVX: cost of 40 {{.*}} %V2i64 = sdiv + %V2i64 = sdiv <2 x i64> undef, <i64 16, i64 16> + ; SSE: cost of 80 {{.*}} %V4i64 = sdiv + ; AVX: cost of 80 {{.*}} %V4i64 = sdiv + %V4i64 = sdiv <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16> + ; SSE: cost of 160 {{.*}} %V8i64 = sdiv + ; AVX: cost of 160 {{.*}} %V8i64 = sdiv + %V8i64 = sdiv <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16> + + ; CHECK: cost of 1 {{.*}} %I32 = sdiv + %I32 = sdiv i32 undef, 16 + ; SSE2: cost of 19 {{.*}} %V4i32 = sdiv + ; SSSE3: cost of 19 {{.*}} %V4i32 = sdiv + ; SSE42: cost of 15 {{.*}} %V4i32 = sdiv + ; AVX: cost of 15 {{.*}} %V4i32 = sdiv + %V4i32 = sdiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16> + ; SSE2: cost of 38 {{.*}} %V8i32 = sdiv + ; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv + ; SSE42: cost of 30 {{.*}} %V8i32 = sdiv + ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv + ; AVX2: cost of 15 {{.*}} %V8i32 = sdiv + ; AVX512: cost of 15 {{.*}} %V8i32 = sdiv + %V8i32 = sdiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + ; SSE2: cost of 76 {{.*}} %V16i32 = sdiv + ; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv + ; SSE42: cost of 60 {{.*}} %V16i32 = sdiv + ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv + ; AVX2: cost of 30 {{.*}} %V16i32 = sdiv + ; AVX512: cost of 15 {{.*}} 
%V16i32 = sdiv + %V16i32 = sdiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + + ; CHECK: cost of 1 {{.*}} %I16 = sdiv + %I16 = sdiv i16 undef, 16 + ; SSE: cost of 6 {{.*}} %V8i16 = sdiv + ; AVX: cost of 6 {{.*}} %V8i16 = sdiv + %V8i16 = sdiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16> + ; SSE: cost of 12 {{.*}} %V16i16 = sdiv + ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv + ; AVX2: cost of 6 {{.*}} %V16i16 = sdiv + ; AVX512: cost of 6 {{.*}} %V16i16 = sdiv + %V16i16 = sdiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16> + ; SSE: cost of 24 {{.*}} %V32i16 = sdiv + ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv + ; AVX2: cost of 12 {{.*}} %V32i16 = sdiv + ; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv + ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv + %V32i16 = sdiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16> + + ; CHECK: cost of 1 {{.*}} %I8 = sdiv + %I8 = sdiv i8 undef, 16 + ; SSE: cost of 320 {{.*}} %V16i8 = sdiv + ; AVX: cost of 320 {{.*}} %V16i8 = sdiv + %V16i8 = sdiv <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16> + ; SSE: cost of 640 {{.*}} %V32i8 = sdiv + ; AVX: cost of 640 {{.*}} %V32i8 = sdiv + %V32i8 = sdiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16> + ; SSE: cost of 1280 {{.*}} %V64i8 = sdiv + ; AVX: cost of 1280 {{.*}} %V64i8 = sdiv + %V64i8 = sdiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16> + + ret i32 undef +} + +; CHECK-LABEL: 'udiv_uniformconstpow2' +define i32 @udiv_uniformconstpow2() { + ; CHECK: cost of 1 {{.*}} %I64 = udiv + %I64 = udiv i64 undef, 16 + ; SSE: cost of 40 {{.*}} %V2i64 = udiv + ; AVX: cost of 40 {{.*}} %V2i64 = udiv + %V2i64 = udiv <2 x i64> undef, <i64 16, i64 16> + ; SSE: cost of 80 {{.*}} %V4i64 = udiv + ; AVX: cost of 80 {{.*}} %V4i64 = udiv + %V4i64 = udiv <4 x i64> undef, <i64 16, i64 16, i64 16, i64 16> + ; SSE: cost of 160 {{.*}} %V8i64 = udiv + ; AVX: cost of 160 {{.*}} %V8i64 = udiv + %V8i64 = udiv <8 x i64> undef, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16> + + ; CHECK: cost of 1 {{.*}} %I32 = udiv + %I32 = udiv i32 undef, 16 + ; SSE: cost of 15 {{.*}} %V4i32 = udiv + ; AVX: cost of 15 {{.*}} %V4i32 = udiv + %V4i32 = udiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16> + ; SSE: cost of 30 {{.*}} %V8i32 = udiv + ; AVX1: cost of 30 {{.*}} %V8i32 = udiv + ; AVX2: cost of 15 {{.*}} %V8i32 = udiv + ; AVX512: cost of 15 {{.*}} %V8i32 = udiv + %V8i32 = udiv <8 x i32> undef, <i32 16, i32 16, i32 
16, i32 16, i32 16, i32 16, i32 16, i32 16> + ; SSE: cost of 60 {{.*}} %V16i32 = udiv + ; AVX1: cost of 60 {{.*}} %V16i32 = udiv + ; AVX2: cost of 30 {{.*}} %V16i32 = udiv + ; AVX512: cost of 15 {{.*}} %V16i32 = udiv + %V16i32 = udiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + + ; CHECK: cost of 1 {{.*}} %I16 = udiv + %I16 = udiv i16 undef, 16 + ; SSE: cost of 6 {{.*}} %V8i16 = udiv + ; AVX: cost of 6 {{.*}} %V8i16 = udiv + %V8i16 = udiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16> + ; SSE: cost of 12 {{.*}} %V16i16 = udiv + ; AVX1: cost of 12 {{.*}} %V16i16 = udiv + ; AVX2: cost of 6 {{.*}} %V16i16 = udiv + ; AVX512: cost of 6 {{.*}} %V16i16 = udiv + %V16i16 = udiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16> + ; SSE: cost of 24 {{.*}} %V32i16 = udiv + ; AVX1: cost of 24 {{.*}} %V32i16 = udiv + ; AVX2: cost of 12 {{.*}} %V32i16 = udiv + ; AVX512F: cost of 12 {{.*}} %V32i16 = udiv + ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv + %V32i16 = udiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16> + + ; CHECK: cost of 1 {{.*}} %I8 = udiv + %I8 = udiv i8 undef, 16 + ; SSE: cost of 320 {{.*}} %V16i8 = udiv + ; AVX: cost of 320 {{.*}} %V16i8 = udiv + %V16i8 = udiv <16 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16> + ; SSE: cost of 640 {{.*}} %V32i8 = udiv + ; AVX: cost of 640 {{.*}} %V32i8 = udiv + %V32i8 = udiv <32 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16> + ; SSE: cost of 1280 {{.*}} %V64i8 = udiv + ; AVX: cost of 1280 {{.*}} %V64i8 = udiv + %V64i8 = udiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16> + + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/fptosi.ll b/test/Analysis/CostModel/X86/fptosi.ll new file mode 100644 index 000000000000..d5e21f8685a7 --- /dev/null +++ b/test/Analysis/CostModel/X86/fptosi.ll @@ -0,0 +1,261 @@ +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE --check-prefix=SSE42 %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx -cost-model -analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX1 %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx2 -cost-model -analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX2 %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f -cost-model -analyze < %s | 
FileCheck --check-prefix=AVX512 --check-prefix=AVX512F %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512dq -cost-model -analyze < %s | FileCheck --check-prefix=AVX512 --check-prefix=AVX512DQ %s + +; CHECK-LABEL: 'fptosi_double_i64' +define i32 @fptosi_double_i64(i32 %arg) { + ; SSE2: cost of 1 {{.*}} %I64 = fptosi + ; SSE42: cost of 1 {{.*}} %I64 = fptosi + ; AVX1: cost of 1 {{.*}} %I64 = fptosi + ; AVX2: cost of 1 {{.*}} %I64 = fptosi + ; AVX512: cost of 1 {{.*}} %I64 = fptosi + %I64 = fptosi double undef to i64 + ; SSE2: cost of 6 {{.*}} %V2I64 = fptosi + ; SSE42: cost of 6 {{.*}} %V2I64 = fptosi + ; AVX1: cost of 6 {{.*}} %V2I64 = fptosi + ; AVX2: cost of 6 {{.*}} %V2I64 = fptosi + ; AVX512F: cost of 6 {{.*}} %V2I64 = fptosi + ; AVX512DQ: cost of 1 {{.*}} %V2I64 = fptosi + %V2I64 = fptosi <2 x double> undef to <2 x i64> + ; SSE2: cost of 13 {{.*}} %V4I64 = fptosi + ; SSE42: cost of 13 {{.*}} %V4I64 = fptosi + ; AVX1: cost of 12 {{.*}} %V4I64 = fptosi + ; AVX2: cost of 12 {{.*}} %V4I64 = fptosi + ; AVX512F: cost of 12 {{.*}} %V4I64 = fptosi + ; AVX512DQ: cost of 1 {{.*}} %V4I64 = fptosi + %V4I64 = fptosi <4 x double> undef to <4 x i64> + ; SSE2: cost of 27 {{.*}} %V8I64 = fptosi + ; SSE42: cost of 27 {{.*}} %V8I64 = fptosi + ; AVX1: cost of 25 {{.*}} %V8I64 = fptosi + ; AVX2: cost of 25 {{.*}} %V8I64 = fptosi + ; AVX512F: cost of 24 {{.*}} %V8I64 = fptosi + ; AVX512DQ: cost of 1 {{.*}} %V8I64 = fptosi + %V8I64 = fptosi <8 x double> undef to <8 x i64> + + ret i32 undef +} + +; CHECK-LABEL: 'fptosi_double_i32' +define i32 @fptosi_double_i32(i32 %arg) { + ; SSE2: cost of 1 {{.*}} %I32 = fptosi + ; SSE42: cost of 1 {{.*}} %I32 = fptosi + ; AVX1: cost of 1 {{.*}} %I32 = fptosi + ; AVX2: cost of 1 {{.*}} %I32 = fptosi + ; AVX512: cost of 1 {{.*}} %I32 = fptosi + %I32 = fptosi double undef to i32 + ; SSE2: cost of 3 {{.*}} %V2I32 = fptosi + ; SSE42: cost of 3 {{.*}} %V2I32 = fptosi + ; AVX1: cost of 3 {{.*}} %V2I32 = fptosi + ; AVX2: cost of 3 {{.*}} %V2I32 = fptosi + ; AVX512: cost of 3 {{.*}} %V2I32 = fptosi + %V2I32 = fptosi <2 x double> undef to <2 x i32> + ; SSE2: cost of 7 {{.*}} %V4I32 = fptosi + ; SSE42: cost of 7 {{.*}} %V4I32 = fptosi + ; AVX1: cost of 1 {{.*}} %V4I32 = fptosi + ; AVX2: cost of 1 {{.*}} %V4I32 = fptosi + ; AVX512: cost of 1 {{.*}} %V4I32 = fptosi + %V4I32 = fptosi <4 x double> undef to <4 x i32> + ; SSE2: cost of 15 {{.*}} %V8I32 = fptosi + ; SSE42: cost of 15 {{.*}} %V8I32 = fptosi + ; AVX1: cost of 3 {{.*}} %V8I32 = fptosi + ; AVX2: cost of 3 {{.*}} %V8I32 = fptosi + ; AVX512: cost of 1 {{.*}} %V8I32 = fptosi + %V8I32 = fptosi <8 x double> undef to <8 x i32> + + ret i32 undef +} + +; CHECK-LABEL: 'fptosi_double_i16' +define i32 @fptosi_double_i16(i32 %arg) { + ; SSE2: cost of 1 {{.*}} %I16 = fptosi + ; SSE42: cost of 1 {{.*}} %I16 = fptosi + ; AVX1: cost of 1 {{.*}} %I16 = fptosi + ; AVX2: cost of 1 {{.*}} %I16 = fptosi + ; AVX512: cost of 1 {{.*}} %I16 = fptosi + %I16 = fptosi double undef to i16 + ; SSE2: cost of 6 {{.*}} %V2I16 = fptosi + ; SSE42: cost of 6 {{.*}} %V2I16 = fptosi + ; AVX1: cost of 6 {{.*}} %V2I16 = fptosi + ; AVX2: cost of 6 {{.*}} %V2I16 = fptosi + ; AVX512F: cost of 6 {{.*}} %V2I16 = fptosi + ; AVX512DQ: cost of 1 {{.*}} %V2I16 = fptosi + %V2I16 = fptosi <2 x double> undef to <2 x i16> + ; SSE2: cost of 13 {{.*}} %V4I16 = fptosi + ; SSE42: cost of 13 {{.*}} %V4I16 = fptosi + ; AVX1: cost of 1 {{.*}} %V4I16 = fptosi + ; AVX2: cost of 1 {{.*}} %V4I16 = fptosi + ; AVX512: cost of 1 {{.*}} %V4I16 = fptosi + %V4I16 = fptosi <4 x 
double> undef to <4 x i16> + ; SSE2: cost of 27 {{.*}} %V8I16 = fptosi + ; SSE42: cost of 27 {{.*}} %V8I16 = fptosi + ; AVX1: cost of 3 {{.*}} %V8I16 = fptosi + ; AVX2: cost of 3 {{.*}} %V8I16 = fptosi + ; AVX512: cost of 1 {{.*}} %V8I16 = fptosi + %V8I16 = fptosi <8 x double> undef to <8 x i16> + + ret i32 undef +} + +; CHECK-LABEL: 'fptosi_double_i8' +define i32 @fptosi_double_i8(i32 %arg) { + ; SSE2: cost of 1 {{.*}} %I8 = fptosi + ; SSE42: cost of 1 {{.*}} %I8 = fptosi + ; AVX1: cost of 1 {{.*}} %I8 = fptosi + ; AVX2: cost of 1 {{.*}} %I8 = fptosi + ; AVX512: cost of 1 {{.*}} %I8 = fptosi + %I8 = fptosi double undef to i8 + ; SSE2: cost of 6 {{.*}} %V2I8 = fptosi + ; SSE42: cost of 6 {{.*}} %V2I8 = fptosi + ; AVX1: cost of 6 {{.*}} %V2I8 = fptosi + ; AVX2: cost of 6 {{.*}} %V2I8 = fptosi + ; AVX512F: cost of 6 {{.*}} %V2I8 = fptosi + ; AVX512DQ: cost of 1 {{.*}} %V2I8 = fptosi + %V2I8 = fptosi <2 x double> undef to <2 x i8> + ; SSE2: cost of 13 {{.*}} %V4I8 = fptosi + ; SSE42: cost of 13 {{.*}} %V4I8 = fptosi + ; AVX1: cost of 1 {{.*}} %V4I8 = fptosi + ; AVX2: cost of 1 {{.*}} %V4I8 = fptosi + ; AVX512: cost of 1 {{.*}} %V4I8 = fptosi + %V4I8 = fptosi <4 x double> undef to <4 x i8> + ; SSE2: cost of 27 {{.*}} %V8I8 = fptosi + ; SSE42: cost of 27 {{.*}} %V8I8 = fptosi + ; AVX1: cost of 3 {{.*}} %V8I8 = fptosi + ; AVX2: cost of 3 {{.*}} %V8I8 = fptosi + ; AVX512: cost of 1 {{.*}} %V8I8 = fptosi + %V8I8 = fptosi <8 x double> undef to <8 x i8> + + ret i32 undef +} + +; CHECK-LABEL: 'fptosi_float_i64' +define i32 @fptosi_float_i64(i32 %arg) { + ; SSE2: cost of 1 {{.*}} %I64 = fptosi + ; SSE42: cost of 1 {{.*}} %I64 = fptosi + ; AVX1: cost of 1 {{.*}} %I64 = fptosi + ; AVX2: cost of 1 {{.*}} %I64 = fptosi + ; AVX512: cost of 1 {{.*}} %I64 = fptosi + %I64 = fptosi float undef to i64 + ; SSE2: cost of 6 {{.*}} %V2I64 = fptosi + ; SSE42: cost of 6 {{.*}} %V2I64 = fptosi + ; AVX1: cost of 6 {{.*}} %V2I64 = fptosi + ; AVX2: cost of 6 {{.*}} %V2I64 = fptosi + ; AVX512F: cost of 6 {{.*}} %V2I64 = fptosi + ; AVX512DQ: cost of 1 {{.*}} %V2I64 = fptosi + %V2I64 = fptosi <2 x float> undef to <2 x i64> + ; SSE2: cost of 13 {{.*}} %V4I64 = fptosi + ; SSE42: cost of 13 {{.*}} %V4I64 = fptosi + ; AVX1: cost of 12 {{.*}} %V4I64 = fptosi + ; AVX2: cost of 12 {{.*}} %V4I64 = fptosi + ; AVX512F: cost of 12 {{.*}} %V4I64 = fptosi + ; AVX512DQ: cost of 1 {{.*}} %V4I64 = fptosi + %V4I64 = fptosi <4 x float> undef to <4 x i64> + ; SSE2: cost of 27 {{.*}} %V8I64 = fptosi + ; SSE42: cost of 27 {{.*}} %V8I64 = fptosi + ; AVX1: cost of 25 {{.*}} %V8I64 = fptosi + ; AVX2: cost of 25 {{.*}} %V8I64 = fptosi + ; AVX512F: cost of 24 {{.*}} %V8I64 = fptosi + ; AVX512DQ: cost of 1 {{.*}} %V8I64 = fptosi + %V8I64 = fptosi <8 x float> undef to <8 x i64> + ; SSE2: cost of 55 {{.*}} %V16I64 = fptosi + ; SSE42: cost of 55 {{.*}} %V16I64 = fptosi + ; AVX1: cost of 51 {{.*}} %V16I64 = fptosi + ; AVX2: cost of 51 {{.*}} %V16I64 = fptosi + ; AVX512F: cost of 49 {{.*}} %V16I64 = fptosi + ; AVX512DQ: cost of 3 {{.*}} %V16I64 = fptosi + %V16I64 = fptosi <16 x float> undef to <16 x i64> + + ret i32 undef +} + +; CHECK-LABEL: 'fptosi_float_i32' +define i32 @fptosi_float_i32(i32 %arg) { + ; SSE2: cost of 1 {{.*}} %I32 = fptosi + ; SSE42: cost of 1 {{.*}} %I32 = fptosi + ; AVX1: cost of 1 {{.*}} %I32 = fptosi + ; AVX2: cost of 1 {{.*}} %I32 = fptosi + ; AVX512: cost of 1 {{.*}} %I32 = fptosi + %I32 = fptosi float undef to i32 + ; SSE2: cost of 1 {{.*}} %V4I32 = fptosi + ; SSE42: cost of 1 {{.*}} %V4I32 = fptosi + ; AVX1: cost of 1 {{.*}} 
%V4I32 = fptosi + ; AVX2: cost of 1 {{.*}} %V4I32 = fptosi + ; AVX512: cost of 1 {{.*}} %V4I32 = fptosi + %V4I32 = fptosi <4 x float> undef to <4 x i32> + ; SSE2: cost of 1 {{.*}} %V8I32 = fptosi + ; SSE42: cost of 1 {{.*}} %V8I32 = fptosi + ; AVX1: cost of 1 {{.*}} %V8I32 = fptosi + ; AVX2: cost of 1 {{.*}} %V8I32 = fptosi + ; AVX512: cost of 1 {{.*}} %V8I32 = fptosi + %V8I32 = fptosi <8 x float> undef to <8 x i32> + ; SSE2: cost of 1 {{.*}} %V16I32 = fptosi + ; SSE42: cost of 1 {{.*}} %V16I32 = fptosi + ; AVX1: cost of 1 {{.*}} %V16I32 = fptosi + ; AVX2: cost of 1 {{.*}} %V16I32 = fptosi + ; AVX512: cost of 1 {{.*}} %V16I32 = fptosi + %V16I32 = fptosi <16 x float> undef to <16 x i32> + + ret i32 undef +} + +; CHECK-LABEL: 'fptosi_float_i16' +define i32 @fptosi_float_i16(i32 %arg) { + ; SSE2: cost of 1 {{.*}} %I16 = fptosi + ; SSE42: cost of 1 {{.*}} %I16 = fptosi + ; AVX1: cost of 1 {{.*}} %I16 = fptosi + ; AVX2: cost of 1 {{.*}} %I16 = fptosi + ; AVX512: cost of 1 {{.*}} %I16 = fptosi + %I16 = fptosi float undef to i16 + ; SSE2: cost of 1 {{.*}} %V4I16 = fptosi + ; SSE42: cost of 1 {{.*}} %V4I16 = fptosi + ; AVX1: cost of 1 {{.*}} %V4I16 = fptosi + ; AVX2: cost of 1 {{.*}} %V4I16 = fptosi + ; AVX512: cost of 1 {{.*}} %V4I16 = fptosi + %V4I16 = fptosi <4 x float> undef to <4 x i16> + ; SSE2: cost of 3 {{.*}} %V8I16 = fptosi + ; SSE42: cost of 3 {{.*}} %V8I16 = fptosi + ; AVX1: cost of 1 {{.*}} %V8I16 = fptosi + ; AVX2: cost of 1 {{.*}} %V8I16 = fptosi + ; AVX512: cost of 1 {{.*}} %V8I16 = fptosi + %V8I16 = fptosi <8 x float> undef to <8 x i16> + ; SSE2: cost of 7 {{.*}} %V16I16 = fptosi + ; SSE42: cost of 7 {{.*}} %V16I16 = fptosi + ; AVX1: cost of 3 {{.*}} %V16I16 = fptosi + ; AVX2: cost of 3 {{.*}} %V16I16 = fptosi + ; AVX512: cost of 48 {{.*}} %V16I16 = fptosi + %V16I16 = fptosi <16 x float> undef to <16 x i16> + + ret i32 undef +} + +; CHECK-LABEL: 'fptosi_float_i8' +define i32 @fptosi_float_i8(i32 %arg) { + ; SSE2: cost of 1 {{.*}} %I8 = fptosi + ; SSE42: cost of 1 {{.*}} %I8 = fptosi + ; AVX1: cost of 1 {{.*}} %I8 = fptosi + ; AVX2: cost of 1 {{.*}} %I8 = fptosi + ; AVX512: cost of 1 {{.*}} %I8 = fptosi + %I8 = fptosi float undef to i8 + ; SSE2: cost of 1 {{.*}} %V4I8 = fptosi + ; SSE42: cost of 1 {{.*}} %V4I8 = fptosi + ; AVX1: cost of 1 {{.*}} %V4I8 = fptosi + ; AVX2: cost of 1 {{.*}} %V4I8 = fptosi + ; AVX512: cost of 1 {{.*}} %V4I8 = fptosi + %V4I8 = fptosi <4 x float> undef to <4 x i8> + ; SSE2: cost of 3 {{.*}} %V8I8 = fptosi + ; SSE42: cost of 3 {{.*}} %V8I8 = fptosi + ; AVX1: cost of 7 {{.*}} %V8I8 = fptosi + ; AVX2: cost of 7 {{.*}} %V8I8 = fptosi + ; AVX512: cost of 7 {{.*}} %V8I8 = fptosi + %V8I8 = fptosi <8 x float> undef to <8 x i8> + ; SSE2: cost of 7 {{.*}} %V16I8 = fptosi + ; SSE42: cost of 7 {{.*}} %V16I8 = fptosi + ; AVX1: cost of 15 {{.*}} %V16I8 = fptosi + ; AVX2: cost of 15 {{.*}} %V16I8 = fptosi + ; AVX512: cost of 48 {{.*}} %V16I8 = fptosi + %V16I8 = fptosi <16 x float> undef to <16 x i8> + + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/fptoui.ll b/test/Analysis/CostModel/X86/fptoui.ll new file mode 100644 index 000000000000..dbdba30357d9 --- /dev/null +++ b/test/Analysis/CostModel/X86/fptoui.ll @@ -0,0 +1,262 @@ +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse4.2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE --check-prefix=SSE42 %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx -cost-model 
-analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX1 %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx2 -cost-model -analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX2 %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f -cost-model -analyze < %s | FileCheck --check-prefix=AVX512 --check-prefix=AVX512F %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512dq -cost-model -analyze < %s | FileCheck --check-prefix=AVX512 --check-prefix=AVX512DQ %s + +; CHECK-LABEL: 'fptoui_double_i64' +define i32 @fptoui_double_i64(i32 %arg) { + ; SSE2: cost of 4 {{.*}} %I64 = fptoui + ; SSE42: cost of 4 {{.*}} %I64 = fptoui + ; AVX1: cost of 4 {{.*}} %I64 = fptoui + ; AVX2: cost of 4 {{.*}} %I64 = fptoui + ; AVX512: cost of 1 {{.*}} %I64 = fptoui + %I64 = fptoui double undef to i64 + ; SSE2: cost of 12 {{.*}} %V2I64 = fptoui + ; SSE42: cost of 12 {{.*}} %V2I64 = fptoui + ; AVX1: cost of 12 {{.*}} %V2I64 = fptoui + ; AVX2: cost of 12 {{.*}} %V2I64 = fptoui + ; AVX512F: cost of 6 {{.*}} %V2I64 = fptoui + ; AVX512DQ: cost of 1 {{.*}} %V2I64 = fptoui + %V2I64 = fptoui <2 x double> undef to <2 x i64> + ; SSE2: cost of 25 {{.*}} %V4I64 = fptoui + ; SSE42: cost of 25 {{.*}} %V4I64 = fptoui + ; AVX1: cost of 24 {{.*}} %V4I64 = fptoui + ; AVX2: cost of 24 {{.*}} %V4I64 = fptoui + ; AVX512F: cost of 12 {{.*}} %V4I64 = fptoui + ; AVX512DQ: cost of 1 {{.*}} %V4I64 = fptoui + %V4I64 = fptoui <4 x double> undef to <4 x i64> + ; SSE2: cost of 51 {{.*}} %V8I64 = fptoui + ; SSE42: cost of 51 {{.*}} %V8I64 = fptoui + ; AVX1: cost of 49 {{.*}} %V8I64 = fptoui + ; AVX2: cost of 49 {{.*}} %V8I64 = fptoui + ; AVX512F: cost of 24 {{.*}} %V8I64 = fptoui + ; AVX512DQ: cost of 1 {{.*}} %V8I64 = fptoui + %V8I64 = fptoui <8 x double> undef to <8 x i64> + + ret i32 undef +} + +; CHECK-LABEL: 'fptoui_double_i32' +define i32 @fptoui_double_i32(i32 %arg) { + ; SSE2: cost of 1 {{.*}} %I32 = fptoui + ; SSE42: cost of 1 {{.*}} %I32 = fptoui + ; AVX1: cost of 1 {{.*}} %I32 = fptoui + ; AVX2: cost of 1 {{.*}} %I32 = fptoui + ; AVX512: cost of 1 {{.*}} %I32 = fptoui + %I32 = fptoui double undef to i32 + ; SSE2: cost of 6 {{.*}} %V2I32 = fptoui + ; SSE42: cost of 6 {{.*}} %V2I32 = fptoui + ; AVX1: cost of 6 {{.*}} %V2I32 = fptoui + ; AVX2: cost of 6 {{.*}} %V2I32 = fptoui + ; AVX512F: cost of 6 {{.*}} %V2I32 = fptoui + ; AVX512DQ: cost of 1 {{.*}} %V2I32 = fptoui + %V2I32 = fptoui <2 x double> undef to <2 x i32> + ; SSE2: cost of 13 {{.*}} %V4I32 = fptoui + ; SSE42: cost of 13 {{.*}} %V4I32 = fptoui + ; AVX1: cost of 16 {{.*}} %V4I32 = fptoui + ; AVX2: cost of 16 {{.*}} %V4I32 = fptoui + ; AVX512: cost of 16 {{.*}} %V4I32 = fptoui + %V4I32 = fptoui <4 x double> undef to <4 x i32> + ; SSE2: cost of 27 {{.*}} %V8I32 = fptoui + ; SSE42: cost of 27 {{.*}} %V8I32 = fptoui + ; AVX1: cost of 33 {{.*}} %V8I32 = fptoui + ; AVX2: cost of 33 {{.*}} %V8I32 = fptoui + ; AVX512: cost of 1 {{.*}} %V8I32 = fptoui + %V8I32 = fptoui <8 x double> undef to <8 x i32> + + ret i32 undef +} + +; CHECK-LABEL: 'fptoui_double_i16' +define i32 @fptoui_double_i16(i32 %arg) { + ; SSE2: cost of 1 {{.*}} %I16 = fptoui + ; SSE42: cost of 1 {{.*}} %I16 = fptoui + ; AVX1: cost of 1 {{.*}} %I16 = fptoui + ; AVX2: cost of 1 {{.*}} %I16 = fptoui + ; AVX512: cost of 1 {{.*}} %I16 = fptoui + %I16 = fptoui double undef to i16 + ; SSE2: cost of 6 {{.*}} %V2I16 = fptoui + ; SSE42: cost of 6 {{.*}} %V2I16 = fptoui + ; AVX1: cost of 6 {{.*}} %V2I16 = fptoui + ; AVX2: cost of 6 {{.*}} %V2I16 = fptoui + ; AVX512F: cost of 6 {{.*}} %V2I16 = 
fptoui + ; AVX512DQ: cost of 1 {{.*}} %V2I16 = fptoui + %V2I16 = fptoui <2 x double> undef to <2 x i16> + ; SSE2: cost of 13 {{.*}} %V4I16 = fptoui + ; SSE42: cost of 13 {{.*}} %V4I16 = fptoui + ; AVX1: cost of 12 {{.*}} %V4I16 = fptoui + ; AVX2: cost of 12 {{.*}} %V4I16 = fptoui + ; AVX512: cost of 1 {{.*}} %V4I16 = fptoui + %V4I16 = fptoui <4 x double> undef to <4 x i16> + ; SSE2: cost of 27 {{.*}} %V8I16 = fptoui + ; SSE42: cost of 27 {{.*}} %V8I16 = fptoui + ; AVX1: cost of 25 {{.*}} %V8I16 = fptoui + ; AVX2: cost of 25 {{.*}} %V8I16 = fptoui + ; AVX512: cost of 1 {{.*}} %V8I16 = fptoui + %V8I16 = fptoui <8 x double> undef to <8 x i16> + + ret i32 undef +} + +; CHECK-LABEL: 'fptoui_double_i8' +define i32 @fptoui_double_i8(i32 %arg) { + ; SSE2: cost of 1 {{.*}} %I8 = fptoui + ; SSE42: cost of 1 {{.*}} %I8 = fptoui + ; AVX1: cost of 1 {{.*}} %I8 = fptoui + ; AVX2: cost of 1 {{.*}} %I8 = fptoui + ; AVX512: cost of 1 {{.*}} %I8 = fptoui + %I8 = fptoui double undef to i8 + ; SSE2: cost of 6 {{.*}} %V2I8 = fptoui + ; SSE42: cost of 6 {{.*}} %V2I8 = fptoui + ; AVX1: cost of 6 {{.*}} %V2I8 = fptoui + ; AVX2: cost of 6 {{.*}} %V2I8 = fptoui + ; AVX512F: cost of 6 {{.*}} %V2I8 = fptoui + ; AVX512DQ: cost of 1 {{.*}} %V2I8 = fptoui + %V2I8 = fptoui <2 x double> undef to <2 x i8> + ; SSE2: cost of 13 {{.*}} %V4I8 = fptoui + ; SSE42: cost of 13 {{.*}} %V4I8 = fptoui + ; AVX1: cost of 12 {{.*}} %V4I8 = fptoui + ; AVX2: cost of 12 {{.*}} %V4I8 = fptoui + ; AVX512: cost of 1 {{.*}} %V4I8 = fptoui + %V4I8 = fptoui <4 x double> undef to <4 x i8> + ; SSE2: cost of 27 {{.*}} %V8I8 = fptoui + ; SSE42: cost of 27 {{.*}} %V8I8 = fptoui + ; AVX1: cost of 25 {{.*}} %V8I8 = fptoui + ; AVX2: cost of 25 {{.*}} %V8I8 = fptoui + ; AVX512: cost of 1 {{.*}} %V8I8 = fptoui + %V8I8 = fptoui <8 x double> undef to <8 x i8> + + ret i32 undef +} + +; CHECK-LABEL: 'fptoui_float_i64' +define i32 @fptoui_float_i64(i32 %arg) { + ; SSE2: cost of 4 {{.*}} %I64 = fptoui + ; SSE42: cost of 4 {{.*}} %I64 = fptoui + ; AVX1: cost of 4 {{.*}} %I64 = fptoui + ; AVX2: cost of 4 {{.*}} %I64 = fptoui + ; AVX512: cost of 1 {{.*}} %I64 = fptoui + %I64 = fptoui float undef to i64 + ; SSE2: cost of 12 {{.*}} %V2I64 = fptoui + ; SSE42: cost of 12 {{.*}} %V2I64 = fptoui + ; AVX1: cost of 12 {{.*}} %V2I64 = fptoui + ; AVX2: cost of 12 {{.*}} %V2I64 = fptoui + ; AVX512F: cost of 6 {{.*}} %V2I64 = fptoui + ; AVX512DQ: cost of 1 {{.*}} %V2I64 = fptoui + %V2I64 = fptoui <2 x float> undef to <2 x i64> + ; SSE2: cost of 25 {{.*}} %V4I64 = fptoui + ; SSE42: cost of 25 {{.*}} %V4I64 = fptoui + ; AVX1: cost of 24 {{.*}} %V4I64 = fptoui + ; AVX2: cost of 24 {{.*}} %V4I64 = fptoui + ; AVX512F: cost of 12 {{.*}} %V4I64 = fptoui + ; AVX512DQ: cost of 1 {{.*}} %V4I64 = fptoui + %V4I64 = fptoui <4 x float> undef to <4 x i64> + ; SSE2: cost of 51 {{.*}} %V8I64 = fptoui + ; SSE42: cost of 51 {{.*}} %V8I64 = fptoui + ; AVX1: cost of 49 {{.*}} %V8I64 = fptoui + ; AVX2: cost of 49 {{.*}} %V8I64 = fptoui + ; AVX512F: cost of 24 {{.*}} %V8I64 = fptoui + ; AVX512DQ: cost of 1 {{.*}} %V8I64 = fptoui + %V8I64 = fptoui <8 x float> undef to <8 x i64> + ; SSE2: cost of 103 {{.*}} %V16I64 = fptoui + ; SSE42: cost of 103 {{.*}} %V16I64 = fptoui + ; AVX1: cost of 99 {{.*}} %V16I64 = fptoui + ; AVX2: cost of 99 {{.*}} %V16I64 = fptoui + ; AVX512F: cost of 49 {{.*}} %V16I64 = fptoui + ; AVX512DQ: cost of 3 {{.*}} %V16I64 = fptoui + %V16I64 = fptoui <16 x float> undef to <16 x i64> + + ret i32 undef +} + +; CHECK-LABEL: 'fptoui_float_i32' +define i32 @fptoui_float_i32(i32 %arg) 
{ + ; SSE2: cost of 1 {{.*}} %I32 = fptoui + ; SSE42: cost of 1 {{.*}} %I32 = fptoui + ; AVX1: cost of 1 {{.*}} %I32 = fptoui + ; AVX2: cost of 1 {{.*}} %I32 = fptoui + ; AVX512: cost of 1 {{.*}} %I32 = fptoui + %I32 = fptoui float undef to i32 + ; SSE2: cost of 12 {{.*}} %V4I32 = fptoui + ; SSE42: cost of 12 {{.*}} %V4I32 = fptoui + ; AVX1: cost of 12 {{.*}} %V4I32 = fptoui + ; AVX2: cost of 12 {{.*}} %V4I32 = fptoui + ; AVX512: cost of 1 {{.*}} %V4I32 = fptoui + %V4I32 = fptoui <4 x float> undef to <4 x i32> + ; SSE2: cost of 25 {{.*}} %V8I32 = fptoui + ; SSE42: cost of 25 {{.*}} %V8I32 = fptoui + ; AVX1: cost of 32 {{.*}} %V8I32 = fptoui + ; AVX2: cost of 32 {{.*}} %V8I32 = fptoui + ; AVX512: cost of 1 {{.*}} %V8I32 = fptoui + %V8I32 = fptoui <8 x float> undef to <8 x i32> + ; SSE2: cost of 51 {{.*}} %V16I32 = fptoui + ; SSE42: cost of 51 {{.*}} %V16I32 = fptoui + ; AVX1: cost of 65 {{.*}} %V16I32 = fptoui + ; AVX2: cost of 65 {{.*}} %V16I32 = fptoui + ; AVX512: cost of 1 {{.*}} %V16I32 = fptoui + %V16I32 = fptoui <16 x float> undef to <16 x i32> + + ret i32 undef +} + +; CHECK-LABEL: 'fptoui_float_i16' +define i32 @fptoui_float_i16(i32 %arg) { + ; SSE2: cost of 1 {{.*}} %I16 = fptoui + ; SSE42: cost of 1 {{.*}} %I16 = fptoui + ; AVX1: cost of 1 {{.*}} %I16 = fptoui + ; AVX2: cost of 1 {{.*}} %I16 = fptoui + ; AVX512: cost of 1 {{.*}} %I16 = fptoui + %I16 = fptoui float undef to i16 + ; SSE2: cost of 12 {{.*}} %V4I16 = fptoui + ; SSE42: cost of 12 {{.*}} %V4I16 = fptoui + ; AVX1: cost of 12 {{.*}} %V4I16 = fptoui + ; AVX2: cost of 12 {{.*}} %V4I16 = fptoui + ; AVX512: cost of 1 {{.*}} %V4I16 = fptoui + %V4I16 = fptoui <4 x float> undef to <4 x i16> + ; SSE2: cost of 25 {{.*}} %V8I16 = fptoui + ; SSE42: cost of 25 {{.*}} %V8I16 = fptoui + ; AVX1: cost of 1 {{.*}} %V8I16 = fptoui + ; AVX2: cost of 1 {{.*}} %V8I16 = fptoui + ; AVX512: cost of 1 {{.*}} %V8I16 = fptoui + %V8I16 = fptoui <8 x float> undef to <8 x i16> + ; SSE2: cost of 51 {{.*}} %V16I16 = fptoui + ; SSE42: cost of 51 {{.*}} %V16I16 = fptoui + ; AVX1: cost of 3 {{.*}} %V16I16 = fptoui + ; AVX2: cost of 3 {{.*}} %V16I16 = fptoui + ; AVX512: cost of 48 {{.*}} %V16I16 = fptoui + %V16I16 = fptoui <16 x float> undef to <16 x i16> + + ret i32 undef +} + +; CHECK-LABEL: 'fptoui_float_i8' +define i32 @fptoui_float_i8(i32 %arg) { + ; SSE2: cost of 1 {{.*}} %I8 = fptoui + ; SSE42: cost of 1 {{.*}} %I8 = fptoui + ; AVX1: cost of 1 {{.*}} %I8 = fptoui + ; AVX2: cost of 1 {{.*}} %I8 = fptoui + ; AVX512: cost of 1 {{.*}} %I8 = fptoui + %I8 = fptoui float undef to i8 + ; SSE2: cost of 12 {{.*}} %V4I8 = fptoui + ; SSE42: cost of 12 {{.*}} %V4I8 = fptoui + ; AVX1: cost of 12 {{.*}} %V4I8 = fptoui + ; AVX2: cost of 12 {{.*}} %V4I8 = fptoui + ; AVX512: cost of 1 {{.*}} %V4I8 = fptoui + %V4I8 = fptoui <4 x float> undef to <4 x i8> + ; SSE2: cost of 25 {{.*}} %V8I8 = fptoui + ; SSE42: cost of 25 {{.*}} %V8I8 = fptoui + ; AVX1: cost of 1 {{.*}} %V8I8 = fptoui + ; AVX2: cost of 1 {{.*}} %V8I8 = fptoui + ; AVX512: cost of 1 {{.*}} %V8I8 = fptoui + %V8I8 = fptoui <8 x float> undef to <8 x i8> + ; SSE2: cost of 51 {{.*}} %V16I8 = fptoui + ; SSE42: cost of 51 {{.*}} %V16I8 = fptoui + ; AVX1: cost of 3 {{.*}} %V16I8 = fptoui + ; AVX2: cost of 3 {{.*}} %V16I8 = fptoui + ; AVX512: cost of 48 {{.*}} %V16I8 = fptoui + %V16I8 = fptoui <16 x float> undef to <16 x i8> + + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/interleave-load-i32.ll b/test/Analysis/CostModel/X86/interleave-load-i32.ll new file mode 100755 index 000000000000..3c94d8c446f9 --- 
/dev/null +++ b/test/Analysis/CostModel/X86/interleave-load-i32.ll @@ -0,0 +1,85 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i32] zeroinitializer, align 16 +@B = global [10240 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i32_interleave4() { +;CHECK-LABEL: load_i32_interleave4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load +;CHECK: Found an estimated cost of 5 for VF 2 For instruction: %0 = load +;CHECK: Found an estimated cost of 5 for VF 4 For instruction: %0 = load +;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %0 = load +;CHECK: Found an estimated cost of 22 for VF 16 For instruction: %0 = load +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 16 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %0 + %3 = or i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %3 + %4 = load i32, i32* %arrayidx6, align 8 + %add7 = add nsw i32 %add3, %4 + %5 = or i64 %indvars.iv, 3 + %arrayidx10 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %5 + %6 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %add7, %6 + %arrayidx13 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %add11, i32* %arrayidx13, align 16 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +define void @load_i32_interleave5() { +;CHECK-LABEL: load_i32_interleave5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load +;CHECK: Found an estimated cost of 6 for VF 2 For instruction: %0 = load +;CHECK: Found an estimated cost of 9 for VF 4 For instruction: %0 = load +;CHECK: Found an estimated cost of 18 for VF 8 For instruction: %0 = load +;CHECK: Found an estimated cost of 35 for VF 16 For instruction: %0 = load +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = add nuw nsw i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %0 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %3 + %4 = load i32, i32* %arrayidx6, align 4 + %add7 = add nsw i32 %add3, %4 + %5 = add nuw nsw i64 %indvars.iv, 3 + %arrayidx10 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %5 + %6 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %add7, %6 + %7 = add nuw nsw i64 %indvars.iv, 4 + %arrayidx14 = getelementptr inbounds [10240 x 
i32], [10240 x i32]* @A, i64 0, i64 %7 + %8 = load i32, i32* %arrayidx14, align 4 + %add15 = add nsw i32 %add11, %8 + %arrayidx17 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %add15, i32* %arrayidx17, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} diff --git a/test/Analysis/CostModel/X86/interleave-store-i32.ll b/test/Analysis/CostModel/X86/interleave-store-i32.ll new file mode 100755 index 000000000000..e3076bfa294b --- /dev/null +++ b/test/Analysis/CostModel/X86/interleave-store-i32.ll @@ -0,0 +1,85 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i32] zeroinitializer, align 16 +@B = global [10240 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @store_i32_interleave4() { +;CHECK-LABEL: store_i32_interleave4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 5 for VF 2 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 5 for VF 4 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 11 for VF 8 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 22 for VF 16 For instruction: store i32 %add16 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %0, i32* %arrayidx2, align 16 + %add = add nsw i32 %0, 1 + %1 = or i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %1 + store i32 %add, i32* %arrayidx7, align 4 + %add10 = add nsw i32 %0, 2 + %2 = or i64 %indvars.iv, 2 + %arrayidx13 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %2 + store i32 %add10, i32* %arrayidx13, align 8 + %add16 = add nsw i32 %0, 3 + %3 = or i64 %indvars.iv, 3 + %arrayidx19 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %3 + store i32 %add16, i32* %arrayidx19, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +define void @store_i32_interleave5() { +;CHECK-LABEL: store_i32_interleave5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 7 for VF 2 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 14 for VF 4 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 21 for VF 8 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 35 for VF 16 For instruction: store i32 %add22 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = 
getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %0, i32* %arrayidx2, align 4 + %add = add nsw i32 %0, 1 + %1 = add nuw nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %1 + store i32 %add, i32* %arrayidx7, align 4 + %add10 = add nsw i32 %0, 2 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx13 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %2 + store i32 %add10, i32* %arrayidx13, align 4 + %add16 = add nsw i32 %0, 3 + %3 = add nuw nsw i64 %indvars.iv, 3 + %arrayidx19 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %3 + store i32 %add16, i32* %arrayidx19, align 4 + %add22 = add nsw i32 %0, 4 + %4 = add nuw nsw i64 %indvars.iv, 4 + %arrayidx25 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %4 + store i32 %add22, i32* %arrayidx25, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} diff --git a/test/Analysis/CostModel/X86/reduction.ll b/test/Analysis/CostModel/X86/reduction.ll index aaafe07c1eb8..45e2215cd36a 100644 --- a/test/Analysis/CostModel/X86/reduction.ll +++ b/test/Analysis/CostModel/X86/reduction.ll @@ -33,7 +33,9 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 ; CHECK-LABEL: reduction_cost_int -; CHECK: cost of 17 {{.*}} extractelement +; CHECK: cost of 11 {{.*}} extractelement +; AVX-LABEL: reduction_cost_int +; AVX: cost of 5 {{.*}} extractelement %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ret i32 %r diff --git a/test/Analysis/CostModel/X86/rem.ll b/test/Analysis/CostModel/X86/rem.ll new file mode 100644 index 000000000000..10ce6775576f --- /dev/null +++ b/test/Analysis/CostModel/X86/rem.ll @@ -0,0 +1,116 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSSE3 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; CHECK-LABEL: 'srem' +define i32 @srem() { + ; CHECK: cost of 1 {{.*}} %I64 = srem + %I64 = srem i64 undef, undef + ; SSE: cost of 6 {{.*}} %V2i64 = srem + ; AVX: cost of 6 {{.*}} %V2i64 = srem + %V2i64 = srem <2 x i64> undef, undef + ; SSE: cost of 12 {{.*}} %V4i64 = srem + ; AVX: 
cost of 12 {{.*}} %V4i64 = srem + %V4i64 = srem <4 x i64> undef, undef + ; SSE: cost of 24 {{.*}} %V8i64 = srem + ; AVX: cost of 24 {{.*}} %V8i64 = srem + %V8i64 = srem <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = srem + %I32 = srem i32 undef, undef + ; SSE: cost of 12 {{.*}} %V4i32 = srem + ; AVX: cost of 12 {{.*}} %V4i32 = srem + %V4i32 = srem <4 x i32> undef, undef + ; SSE: cost of 24 {{.*}} %V8i32 = srem + ; AVX: cost of 24 {{.*}} %V8i32 = srem + %V8i32 = srem <8 x i32> undef, undef + ; SSE: cost of 48 {{.*}} %V16i32 = srem + ; AVX: cost of 48 {{.*}} %V16i32 = srem + %V16i32 = srem <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = srem + %I16 = srem i16 undef, undef + ; SSE: cost of 24 {{.*}} %V8i16 = srem + ; AVX: cost of 24 {{.*}} %V8i16 = srem + %V8i16 = srem <8 x i16> undef, undef + ; SSE: cost of 48 {{.*}} %V16i16 = srem + ; AVX: cost of 48 {{.*}} %V16i16 = srem + %V16i16 = srem <16 x i16> undef, undef + ; SSE: cost of 96 {{.*}} %V32i16 = srem + ; AVX: cost of 96 {{.*}} %V32i16 = srem + %V32i16 = srem <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = srem + %I8 = srem i8 undef, undef + ; SSE: cost of 48 {{.*}} %V16i8 = srem + ; AVX: cost of 48 {{.*}} %V16i8 = srem + %V16i8 = srem <16 x i8> undef, undef + ; SSE: cost of 96 {{.*}} %V32i8 = srem + ; AVX: cost of 96 {{.*}} %V32i8 = srem + %V32i8 = srem <32 x i8> undef, undef + ; SSE: cost of 192 {{.*}} %V64i8 = srem + ; AVX: cost of 192 {{.*}} %V64i8 = srem + %V64i8 = srem <64 x i8> undef, undef + + ret i32 undef +} + +; CHECK-LABEL: 'urem' +define i32 @urem() { + ; CHECK: cost of 1 {{.*}} %I64 = urem + %I64 = urem i64 undef, undef + ; SSE: cost of 6 {{.*}} %V2i64 = urem + ; AVX: cost of 6 {{.*}} %V2i64 = urem + %V2i64 = urem <2 x i64> undef, undef + ; SSE: cost of 12 {{.*}} %V4i64 = urem + ; AVX: cost of 12 {{.*}} %V4i64 = urem + %V4i64 = urem <4 x i64> undef, undef + ; SSE: cost of 24 {{.*}} %V8i64 = urem + ; AVX: cost of 24 {{.*}} %V8i64 = urem + %V8i64 = urem <8 x i64> undef, undef + + ; CHECK: cost of 1 {{.*}} %I32 = urem + %I32 = urem i32 undef, undef + ; SSE: cost of 12 {{.*}} %V4i32 = urem + ; AVX: cost of 12 {{.*}} %V4i32 = urem + %V4i32 = urem <4 x i32> undef, undef + ; SSE: cost of 24 {{.*}} %V8i32 = urem + ; AVX: cost of 24 {{.*}} %V8i32 = urem + %V8i32 = urem <8 x i32> undef, undef + ; SSE: cost of 48 {{.*}} %V16i32 = urem + ; AVX: cost of 48 {{.*}} %V16i32 = urem + %V16i32 = urem <16 x i32> undef, undef + + ; CHECK: cost of 1 {{.*}} %I16 = urem + %I16 = urem i16 undef, undef + ; SSE: cost of 24 {{.*}} %V8i16 = urem + ; AVX: cost of 24 {{.*}} %V8i16 = urem + %V8i16 = urem <8 x i16> undef, undef + ; SSE: cost of 48 {{.*}} %V16i16 = urem + ; AVX: cost of 48 {{.*}} %V16i16 = urem + %V16i16 = urem <16 x i16> undef, undef + ; SSE: cost of 96 {{.*}} %V32i16 = urem + ; AVX: cost of 96 {{.*}} %V32i16 = urem + %V32i16 = urem <32 x i16> undef, undef + + ; CHECK: cost of 1 {{.*}} %I8 = urem + %I8 = urem i8 undef, undef + ; SSE: cost of 48 {{.*}} %V16i8 = urem + ; AVX: cost of 48 {{.*}} %V16i8 = urem + %V16i8 = urem <16 x i8> undef, undef + ; SSE: cost of 96 {{.*}} %V32i8 = urem + ; AVX: cost of 96 {{.*}} %V32i8 = urem + %V32i8 = urem <32 x i8> undef, undef + ; SSE: cost of 192 {{.*}} %V64i8 = urem + ; AVX: cost of 192 {{.*}} %V64i8 = urem + %V64i8 = urem <64 x i8> undef, undef + + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/scalarize.ll b/test/Analysis/CostModel/X86/scalarize.ll index fd4e3005bd9f..53808871dd64 100644 --- a/test/Analysis/CostModel/X86/scalarize.ll +++ 
b/test/Analysis/CostModel/X86/scalarize.ll @@ -28,11 +28,11 @@ define void @test_scalarized_intrinsics() { ; CHECK64: cost of 1 {{.*}}bswap.v2i64 %r3 = call %i8 @llvm.bswap.v2i64(%i8 undef) -; CHECK32: cost of 12 {{.*}}cttz.v4i32 -; CHECK64: cost of 12 {{.*}}cttz.v4i32 +; CHECK32: cost of 14 {{.*}}cttz.v4i32 +; CHECK64: cost of 14 {{.*}}cttz.v4i32 %r4 = call %i4 @llvm.cttz.v4i32(%i4 undef) ; CHECK32: cost of 10 {{.*}}cttz.v2i64 -; CHECK64: cost of 6 {{.*}}cttz.v2i64 +; CHECK64: cost of 10 {{.*}}cttz.v2i64 %r5 = call %i8 @llvm.cttz.v2i64(%i8 undef) ; CHECK32: ret diff --git a/test/Analysis/CostModel/X86/shuffle-broadcast.ll b/test/Analysis/CostModel/X86/shuffle-broadcast.ll new file mode 100644 index 000000000000..a829a47f89f2 --- /dev/null +++ b/test/Analysis/CostModel/X86/shuffle-broadcast.ll @@ -0,0 +1,31 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSSE3 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW + +; +; Verify the cost model for broadcast shuffles. 
+; + +; CHECK-LABEL: 'test_vXf64' +define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) { + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer + + ret void +} diff --git a/test/Analysis/CostModel/X86/shuffle-reverse.ll b/test/Analysis/CostModel/X86/shuffle-reverse.ll new file mode 100644 index 000000000000..a1bdda0690aa --- /dev/null +++ b/test/Analysis/CostModel/X86/shuffle-reverse.ll @@ -0,0 +1,168 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSSE3 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW + +; +; Verify the cost model for reverse shuffles. 
+; + +; CHECK-LABEL: 'test_vXf64' +define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) { + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> <i32 1, i32 0> + + ; SSE: cost of 2 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + + ; SSE: cost of 4 {{.*}} %V512 = shufflevector + ; AVX1: cost of 4 {{.*}} %V512 = shufflevector + ; AVX2: cost of 2 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ret void +} + +; CHECK-LABEL: 'test_vXi64' +define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) { + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> <i32 1, i32 0> + + ; SSE: cost of 2 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + + ; SSE: cost of 4 {{.*}} %V512 = shufflevector + ; AVX1: cost of 4 {{.*}} %V512 = shufflevector + ; AVX2: cost of 2 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ret void +} + +; CHECK-LABEL: 'test_vXf32' +define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) { + ; SSE: cost of 1 {{.*}} %V64 = shufflevector + ; AVX: cost of 1 {{.*}} %V64 = shufflevector + ; AVX512: cost of 1 {{.*}} %V64 = shufflevector + %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> <i32 1, i32 0> + + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + + ; SSE: cost of 2 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SSE: cost of 4 {{.*}} %V512 = shufflevector + ; AVX1: cost of 4 {{.*}} %V512 = shufflevector + ; AVX2: cost of 2 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ret void +} + +; CHECK-LABEL: 'test_vXi32' +define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) { + ; SSE: cost of 1 {{.*}} %V64 = shufflevector + ; 
AVX: cost of 1 {{.*}} %V64 = shufflevector + ; AVX512: cost of 1 {{.*}} %V64 = shufflevector + %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> <i32 1, i32 0> + + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + + ; SSE: cost of 2 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SSE: cost of 4 {{.*}} %V512 = shufflevector + ; AVX1: cost of 4 {{.*}} %V512 = shufflevector + ; AVX2: cost of 2 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ret void +} + +; CHECK-LABEL: 'test_vXi16' +define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) { + ; SSE2: cost of 3 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SSE2: cost of 6 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 2 {{.*}} %V256 = shufflevector + ; SSE42: cost of 2 {{.*}} %V256 = shufflevector + ; AVX1: cost of 4 {{.*}} %V256 = shufflevector + ; AVX2: cost of 2 {{.*}} %V256 = shufflevector + ; AVX512F: cost of 2 {{.*}} %V256 = shufflevector + ; AVX512BW: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SSE2: cost of 12 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 4 {{.*}} %V512 = shufflevector + ; SSE42: cost of 4 {{.*}} %V512 = shufflevector + ; AVX1: cost of 8 {{.*}} %V512 = shufflevector + ; AVX2: cost of 4 {{.*}} %V512 = shufflevector + ; AVX512F: cost of 4 {{.*}} %V512 = shufflevector + ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ret void +} + +; CHECK-LABEL: 'test_vXi8' +define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { + ; SSE2: cost of 9 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SSE2: cost of 18 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 2 {{.*}} %V256 = shufflevector + ; SSE42: cost 
of 2 {{.*}} %V256 = shufflevector + ; AVX1: cost of 4 {{.*}} %V256 = shufflevector + ; AVX2: cost of 2 {{.*}} %V256 = shufflevector + ; AVX512: cost of 2 {{.*}} %V256 = shufflevector + %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SSE2: cost of 36 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 4 {{.*}} %V512 = shufflevector + ; SSE42: cost of 4 {{.*}} %V512 = shufflevector + ; AVX1: cost of 8 {{.*}} %V512 = shufflevector + ; AVX2: cost of 4 {{.*}} %V512 = shufflevector + ; AVX512F: cost of 4 {{.*}} %V512 = shufflevector + ; AVX512BW: cost of 6 {{.*}} %V512 = shufflevector + %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ret void +} diff --git a/test/Analysis/CostModel/X86/shuffle-single-src.ll b/test/Analysis/CostModel/X86/shuffle-single-src.ll new file mode 100644 index 000000000000..a953ec17d80f --- /dev/null +++ b/test/Analysis/CostModel/X86/shuffle-single-src.ll @@ -0,0 +1,94 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake-avx512 | FileCheck %s --check-prefix=SKX + +; +; Verify the cost model for 1 src shuffles +; + +; SKX-LABEL: 'test_vXf64' +define void @test_vXf64(<4 x double> %src256, <8 x double> %src512, <16 x double> %src1024) { + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 2 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ret void +} + +; SKX-LABEL: 'test_vXi64' +define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) { + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0> + + ret void +} + +; CHECK-LABEL: 'test_vXf32' +define void @test_vXf32(<4 x float> %src128, <8 x float> %src256, <16 x float> %src512) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + 
%V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ret void +} + +; CHECK-LABEL: 'test_vXi32' +define void @test_vXi32(<4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512, <32 x i32> %src1024) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 13, i32 10, i32 9, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 2 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + ret void +} + +; CHECK-LABEL: 'test_vXi16' +define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512, <64 x i16> %src1024) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 2 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + ret void +} + +; CHECK-LABEL: 'test_vXi8' +define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 3 {{.*}} %V256 = shufflevector + %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, 
i32 8, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 8 {{.*}} %V512 = shufflevector + %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ret void +} diff --git a/test/Analysis/CostModel/X86/shuffle-two-src.ll b/test/Analysis/CostModel/X86/shuffle-two-src.ll new file mode 100644 index 000000000000..de79a82e66ae --- /dev/null +++ b/test/Analysis/CostModel/X86/shuffle-two-src.ll @@ -0,0 +1,68 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake-avx512 | FileCheck %s --check-prefix=SKX + +; +; Verify the cost model for 2 src shuffles +; + +; SKX-LABEL: 'test_vXf64' +define void @test_vXf64(<4 x double> %src256, <8 x double> %src512, <16 x double> %src1024, <4 x double> %src256_1, <8 x double> %src512_1, <16 x double> %src1024_1) { + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> <i32 3, i32 3, i32 7, i32 6> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> <i32 7, i32 6, i32 12, i32 4, i32 3, i32 2, i32 1, i32 15> + + ; SKX: cost of 6 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> <i32 30, i32 14, i32 13, i32 12, i32 13, i32 10, i32 18, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ret void +} + +; CHECK-LABEL: 'test_vXf32' +define void @test_vXf32(<4 x float> %src128, <8 x float> %src256, <16 x float> %src512, <32 x float> %src1024, <4 x float> %src128_1, <8 x float> %src256_1, <16 x float> %src512_1, <32 x float> %src1024_1) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> <i32 3, i32 6, i32 1, i32 5> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> <i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 12, i32 0> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> <i32 15, i32 17, i32 13, i32 20, i32 11, i32 10, i32 8, i32 8, i32 7, i32 22, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 6 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> <i32 31, i32 33, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 48, i32 13, i32 12, i32 11, i32 11, i32 9, i32 45, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ret void +} + +; CHECK-LABEL: 'test_vXi16' +define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512, <64 x i16> %src1024, <8 x i16> %src128_1, <16 x i16> %src256_1, <32 x i16> %src512_1, <64 x i16> %src1024_1) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> <i32 7, i32 6, i32 6, i32 8, i32 9, i32 2, 
i32 1, i32 0> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> <i32 15, i32 14, i32 13, i32 20, i32 21, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 38, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 6 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> <i32 63, i32 62, i32 71, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 66, i32 2, i32 1, i32 0> + ret void +} + +; CHECK-LABEL: 'test_vXi8' +define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512, <16 x i8> %src128_1, <32 x i8> %src256_1, <64 x i8> %src512_1) { + ; SKX: cost of 3 {{.*}} %V128 = shufflevector + %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> <i32 29, i32 14, i32 28, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 3 {{.*}} %V256 = shufflevector + %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> <i32 31, i32 30, i32 45, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0> + + ; SKX: cost of 19 {{.*}} %V512 = shufflevector + %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> <i32 63, i32 100, i32 61, i32 96, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + + ret void +} diff --git a/test/Analysis/CostModel/X86/sitofp.ll b/test/Analysis/CostModel/X86/sitofp.ll index fb390a2b17aa..a30cb5f7e823 100644 --- a/test/Analysis/CostModel/X86/sitofp.ll +++ b/test/Analysis/CostModel/X86/sitofp.ll @@ -1,678 +1,250 @@ ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx -cost-model -analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX1 %s ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx2 -cost-model -analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX2 %s -; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f -cost-model -analyze < %s | FileCheck --check-prefix=AVX512F %s - -define <2 x double> @sitofpv2i8v2double(<2 x i8> %a) { - ; SSE2-LABEL: 
sitofpv2i8v2double - ; SSE2: cost of 20 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv2i8v2double - ; AVX1: cost of 4 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv2i8v2double - ; AVX2: cost of 4 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv2i8v2double - ; AVX512F: cost of 4 {{.*}} sitofp - %1 = sitofp <2 x i8> %a to <2 x double> - ret <2 x double> %1 -} - -define <4 x double> @sitofpv4i8v4double(<4 x i8> %a) { - ; SSE2-LABEL: sitofpv4i8v4double - ; SSE2: cost of 40 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv4i8v4double - ; AVX1: cost of 3 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv4i8v4double - ; AVX2: cost of 3 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv4i8v4double - ; AVX512F: cost of 3 {{.*}} sitofp - %1 = sitofp <4 x i8> %a to <4 x double> - ret <4 x double> %1 -} - -define <8 x double> @sitofpv8i8v8double(<8 x i8> %a) { - ; SSE2-LABEL: sitofpv8i8v8double - ; SSE2: cost of 80 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv8i8v8double - ; AVX1: cost of 7 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv8i8v8double - ; AVX2: cost of 7 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv8i8v8double - ; AVX512F: cost of 2 {{.*}} sitofp - %1 = sitofp <8 x i8> %a to <8 x double> - ret <8 x double> %1 -} - -define <16 x double> @sitofpv16i8v16double(<16 x i8> %a) { - ; SSE2-LABEL: sitofpv16i8v16double - ; SSE2: cost of 160 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv16i8v16double - ; AVX1: cost of 15 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv16i8v16double - ; AVX2: cost of 15 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv16i8v16double - ; AVX512F: cost of 5 {{.*}} sitofp - %1 = sitofp <16 x i8> %a to <16 x double> - ret <16 x double> %1 -} - -define <32 x double> @sitofpv32i8v32double(<32 x i8> %a) { - ; SSE2-LABEL: sitofpv32i8v32double - ; SSE2: cost of 320 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv32i8v32double - ; AVX1: cost of 31 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv32i8v32double - ; AVX2: cost of 31 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv32i8v32double - ; AVX512F: cost of 11 {{.*}} sitofp - %1 = sitofp <32 x i8> %a to <32 x double> - ret <32 x double> %1 -} - -define <2 x double> @sitofpv2i16v2double(<2 x i16> %a) { - ; SSE2-LABEL: sitofpv2i16v2double - ; SSE2: cost of 20 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv2i16v2double - ; AVX1: cost of 4 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv2i16v2double - ; AVX2: cost of 4 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv2i16v2double - ; AVX512F: cost of 4 {{.*}} sitofp - %1 = sitofp <2 x i16> %a to <2 x double> - ret <2 x double> %1 -} - -define <4 x double> @sitofpv4i16v4double(<4 x i16> %a) { - ; SSE2-LABEL: sitofpv4i16v4double - ; SSE2: cost of 40 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv4i16v4double - ; AVX1: cost of 3 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv4i16v4double - ; AVX2: cost of 3 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv4i16v4double - ; AVX512F: cost of 3 {{.*}} sitofp - %1 = sitofp <4 x i16> %a to <4 x double> - ret <4 x double> %1 -} - -define <8 x double> @sitofpv8i16v8double(<8 x i16> %a) { - ; SSE2-LABEL: sitofpv8i16v8double - ; SSE2: cost of 80 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv8i16v8double - ; AVX1: cost of 7 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv8i16v8double - ; AVX2: cost of 7 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv8i16v8double - ; AVX512F: cost of 2 {{.*}} sitofp - %1 = sitofp <8 x i16> %a to <8 x double> - ret <8 x double> %1 -} - -define <16 x double> @sitofpv16i16v16double(<16 x i16> %a) { - ; SSE2-LABEL: sitofpv16i16v16double - ; SSE2: cost of 160 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv16i16v16double - ; AVX1: cost of 15 {{.*}} 
sitofp - ; - ; AVX2-LABEL: sitofpv16i16v16double - ; AVX2: cost of 15 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv16i16v16double - ; AVX512F: cost of 5 {{.*}} sitofp - %1 = sitofp <16 x i16> %a to <16 x double> - ret <16 x double> %1 -} - -define <32 x double> @sitofpv32i16v32double(<32 x i16> %a) { - ; SSE2-LABEL: sitofpv32i16v32double - ; SSE2: cost of 320 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv32i16v32double - ; AVX1: cost of 31 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv32i16v32double - ; AVX2: cost of 31 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv32i16v32double - ; AVX512F: cost of 11 {{.*}} sitofp - %1 = sitofp <32 x i16> %a to <32 x double> - ret <32 x double> %1 -} - -define <2 x double> @sitofpv2i32v2double(<2 x i32> %a) { - ; SSE2-LABEL: sitofpv2i32v2double - ; SSE2: cost of 20 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv2i32v2double - ; AVX1: cost of 4 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv2i32v2double - ; AVX2: cost of 4 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv2i32v2double - ; AVX512F: cost of 4 {{.*}} sitofp - %1 = sitofp <2 x i32> %a to <2 x double> - ret <2 x double> %1 -} - -define <4 x double> @sitofpv4i32v4double(<4 x i32> %a) { - ; SSE2-LABEL: sitofpv4i32v4double - ; SSE2: cost of 40 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv4i32v4double - ; AVX1: cost of 1 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv4i32v4double - ; AVX2: cost of 1 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv4i32v4double - ; AVX512F: cost of 1 {{.*}} sitofp - %1 = sitofp <4 x i32> %a to <4 x double> - ret <4 x double> %1 -} - -define <8 x double> @sitofpv8i32v8double(<8 x i32> %a) { - ; SSE2-LABEL: sitofpv8i32v8double - ; SSE2: cost of 80 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv8i32v8double - ; AVX1: cost of 3 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv8i32v8double - ; AVX2: cost of 3 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv8i32v8double - ; AVX512F: cost of 1 {{.*}} sitofp - %1 = sitofp <8 x i32> %a to <8 x double> - ret <8 x double> %1 -} - -define <16 x double> @sitofpv16i32v16double(<16 x i32> %a) { - ; SSE2-LABEL: sitofpv16i32v16double - ; SSE2: cost of 160 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv16i32v16double - ; AVX1: cost of 7 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv16i32v16double - ; AVX2: cost of 7 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv16i32v16double - ; AVX512F: cost of 3 {{.*}} sitofp - %1 = sitofp <16 x i32> %a to <16 x double> - ret <16 x double> %1 -} - -define <32 x double> @sitofpv32i32v32double(<32 x i32> %a) { - ; SSE2-LABEL: sitofpv32i32v32double - ; SSE2: cost of 320 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv32i32v32double - ; AVX1: cost of 15 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv32i32v32double - ; AVX2: cost of 15 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv32i32v32double - ; AVX512F: cost of 7 {{.*}} sitofp - %1 = sitofp <32 x i32> %a to <32 x double> - ret <32 x double> %1 -} - -define <2 x double> @sitofpv2i64v2double(<2 x i64> %a) { - ; SSE2-LABEL: sitofpv2i64v2double - ; SSE2: cost of 20 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv2i64v2double - ; AVX1: cost of 20 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv2i64v2double - ; AVX2: cost of 20 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv2i64v2double - ; AVX512F: cost of 20 {{.*}} sitofp - %1 = sitofp <2 x i64> %a to <2 x double> - ret <2 x double> %1 -} - -define <4 x double> @sitofpv4i64v4double(<4 x i64> %a) { - ; SSE2-LABEL: sitofpv4i64v4double - ; SSE2: cost of 40 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv4i64v4double - ; AVX1: cost of 13 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv4i64v4double - ; AVX2: cost of 13 
{{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv4i64v4double - ; AVX512F: cost of 13 {{.*}} sitofp - %1 = sitofp <4 x i64> %a to <4 x double> - ret <4 x double> %1 -} - -define <8 x double> @sitofpv8i64v8double(<8 x i64> %a) { - ; SSE2-LABEL: sitofpv8i64v8double - ; SSE2: cost of 80 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv8i64v8double - ; AVX1: cost of 27 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv8i64v8double - ; AVX2: cost of 27 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv8i64v8double - ; AVX512F: cost of 22 {{.*}} sitofp - %1 = sitofp <8 x i64> %a to <8 x double> - ret <8 x double> %1 -} - -define <16 x double> @sitofpv16i64v16double(<16 x i64> %a) { - ; SSE2-LABEL: sitofpv16i64v16double - ; SSE2: cost of 160 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv16i64v16double - ; AVX1: cost of 55 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv16i64v16double - ; AVX2: cost of 55 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv16i64v16double - ; AVX512F: cost of 45 {{.*}} sitofp - %1 = sitofp <16 x i64> %a to <16 x double> - ret <16 x double> %1 -} - -define <32 x double> @sitofpv32i64v32double(<32 x i64> %a) { - ; SSE2-LABEL: sitofpv32i64v32double - ; SSE2: cost of 320 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv32i64v32double - ; AVX1: cost of 111 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv32i64v32double - ; AVX2: cost of 111 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv32i64v32double - ; AVX512F: cost of 91 {{.*}} sitofp - %1 = sitofp <32 x i64> %a to <32 x double> - ret <32 x double> %1 -} - -define <2 x float> @sitofpv2i8v2float(<2 x i8> %a) { - ; SSE2-LABEL: sitofpv2i8v2float - ; SSE2: cost of 15 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv2i8v2float - ; AVX1: cost of 4 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv2i8v2float - ; AVX2: cost of 4 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv2i8v2float - ; AVX512F: cost of 4 {{.*}} sitofp - %1 = sitofp <2 x i8> %a to <2 x float> - ret <2 x float> %1 -} - -define <4 x float> @sitofpv4i8v4float(<4 x i8> %a) { - ; SSE2-LABEL: sitofpv4i8v4float - ; SSE2: cost of 5 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv4i8v4float - ; AVX1: cost of 3 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv4i8v4float - ; AVX2: cost of 3 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv4i8v4float - ; AVX512F: cost of 3 {{.*}} sitofp - %1 = sitofp <4 x i8> %a to <4 x float> - ret <4 x float> %1 -} - -define <8 x float> @sitofpv8i8v8float(<8 x i8> %a) { - ; SSE2-LABEL: sitofpv8i8v8float - ; SSE2: cost of 15 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv8i8v8float - ; AVX1: cost of 8 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv8i8v8float - ; AVX2: cost of 8 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv8i8v8float - ; AVX512F: cost of 8 {{.*}} sitofp - %1 = sitofp <8 x i8> %a to <8 x float> - ret <8 x float> %1 -} - -define <16 x float> @sitofpv16i8v16float(<16 x i8> %a) { - ; SSE2-LABEL: sitofpv16i8v16float - ; SSE2: cost of 8 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv16i8v16float - ; AVX1: cost of 17 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv16i8v16float - ; AVX2: cost of 17 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv16i8v16float - ; AVX512F: cost of 2 {{.*}} sitofp - %1 = sitofp <16 x i8> %a to <16 x float> - ret <16 x float> %1 -} - -define <32 x float> @sitofpv32i8v32float(<32 x i8> %a) { - ; SSE2-LABEL: sitofpv32i8v32float - ; SSE2: cost of 16 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv32i8v32float - ; AVX1: cost of 35 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv32i8v32float - ; AVX2: cost of 35 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv32i8v32float - ; AVX512F: cost of 5 {{.*}} sitofp - %1 = sitofp <32 x i8> %a to <32 x float> 
- ret <32 x float> %1 -} - -define <2 x float> @sitofpv2i16v2float(<2 x i16> %a) { - ; SSE2-LABEL: sitofpv2i16v2float - ; SSE2: cost of 15 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv2i16v2float - ; AVX1: cost of 4 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv2i16v2float - ; AVX2: cost of 4 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv2i16v2float - ; AVX512F: cost of 4 {{.*}} sitofp - %1 = sitofp <2 x i16> %a to <2 x float> - ret <2 x float> %1 -} - -define <4 x float> @sitofpv4i16v4float(<4 x i16> %a) { - ; SSE2-LABEL: sitofpv4i16v4float - ; SSE2: cost of 5 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv4i16v4float - ; AVX1: cost of 3 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv4i16v4float - ; AVX2: cost of 3 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv4i16v4float - ; AVX512F: cost of 3 {{.*}} sitofp - %1 = sitofp <4 x i16> %a to <4 x float> - ret <4 x float> %1 -} - -define <8 x float> @sitofpv8i16v8float(<8 x i16> %a) { - ; SSE2-LABEL: sitofpv8i16v8float - ; SSE2: cost of 15 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv8i16v8float - ; AVX1: cost of 5 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv8i16v8float - ; AVX2: cost of 5 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv8i16v8float - ; AVX512F: cost of 5 {{.*}} sitofp - %1 = sitofp <8 x i16> %a to <8 x float> - ret <8 x float> %1 -} - -define <16 x float> @sitofpv16i16v16float(<16 x i16> %a) { - ; SSE2-LABEL: sitofpv16i16v16float - ; SSE2: cost of 30 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv16i16v16float - ; AVX1: cost of 11 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv16i16v16float - ; AVX2: cost of 11 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv16i16v16float - ; AVX512F: cost of 2 {{.*}} sitofp - %1 = sitofp <16 x i16> %a to <16 x float> - ret <16 x float> %1 -} - -define <32 x float> @sitofpv32i16v32float(<32 x i16> %a) { - ; SSE2-LABEL: sitofpv32i16v32float - ; SSE2: cost of 60 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv32i16v32float - ; AVX1: cost of 23 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv32i16v32float - ; AVX2: cost of 23 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv32i16v32float - ; AVX512F: cost of 5 {{.*}} sitofp - %1 = sitofp <32 x i16> %a to <32 x float> - ret <32 x float> %1 -} - -define <2 x float> @sitofpv2i32v2float(<2 x i32> %a) { - ; SSE2-LABEL: sitofpv2i32v2float - ; SSE2: cost of 15 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv2i32v2float - ; AVX1: cost of 4 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv2i32v2float - ; AVX2: cost of 4 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv2i32v2float - ; AVX512F: cost of 4 {{.*}} sitofp - %1 = sitofp <2 x i32> %a to <2 x float> - ret <2 x float> %1 -} - -define <4 x float> @sitofpv4i32v4float(<4 x i32> %a) { - ; SSE2-LABEL: sitofpv4i32v4float - ; SSE2: cost of 5 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv4i32v4float - ; AVX1: cost of 1 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv4i32v4float - ; AVX2: cost of 1 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv4i32v4float - ; AVX512F: cost of 1 {{.*}} sitofp - %1 = sitofp <4 x i32> %a to <4 x float> - ret <4 x float> %1 -} - -define <8 x float> @sitofpv8i32v8float(<8 x i32> %a) { - ; SSE2-LABEL: sitofpv8i32v8float - ; SSE2: cost of 10 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv8i32v8float - ; AVX1: cost of 1 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv8i32v8float - ; AVX2: cost of 1 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv8i32v8float - ; AVX512F: cost of 1 {{.*}} sitofp - %1 = sitofp <8 x i32> %a to <8 x float> - ret <8 x float> %1 -} - -define <16 x float> @sitofpv16i32v16float(<16 x i32> %a) { - ; SSE2-LABEL: sitofpv16i32v16float - ; SSE2: cost of 20 {{.*}} sitofp - ; - ; 
AVX1-LABEL: sitofpv16i32v16float - ; AVX1: cost of 3 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv16i32v16float - ; AVX2: cost of 3 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv16i32v16float - ; AVX512F: cost of 1 {{.*}} sitofp - %1 = sitofp <16 x i32> %a to <16 x float> - ret <16 x float> %1 -} - -define <32 x float> @sitofpv32i32v32float(<32 x i32> %a) { - ; SSE2-LABEL: sitofpv32i32v32float - ; SSE2: cost of 40 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv32i32v32float - ; AVX1: cost of 7 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv32i32v32float - ; AVX2: cost of 7 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv32i32v32float - ; AVX512F: cost of 3 {{.*}} sitofp - %1 = sitofp <32 x i32> %a to <32 x float> - ret <32 x float> %1 -} - -define <2 x float> @sitofpv2i64v2float(<2 x i64> %a) { - ; SSE2-LABEL: sitofpv2i64v2float - ; SSE2: cost of 15 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv2i64v2float - ; AVX1: cost of 4 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv2i64v2float - ; AVX2: cost of 4 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv2i64v2float - ; AVX512F: cost of 4 {{.*}} sitofp - %1 = sitofp <2 x i64> %a to <2 x float> - ret <2 x float> %1 -} - -define <4 x float> @sitofpv4i64v4float(<4 x i64> %a) { - ; SSE2-LABEL: sitofpv4i64v4float - ; SSE2: cost of 30 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv4i64v4float - ; AVX1: cost of 10 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv4i64v4float - ; AVX2: cost of 10 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv4i64v4float - ; AVX512F: cost of 10 {{.*}} sitofp - %1 = sitofp <4 x i64> %a to <4 x float> - ret <4 x float> %1 -} - -define <8 x float> @sitofpv8i64v8float(<8 x i64> %a) { - ; SSE2-LABEL: sitofpv8i64v8float - ; SSE2: cost of 60 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv8i64v8float - ; AVX1: cost of 21 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv8i64v8float - ; AVX2: cost of 21 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv8i64v8float - ; AVX512F: cost of 22 {{.*}} sitofp - %1 = sitofp <8 x i64> %a to <8 x float> - ret <8 x float> %1 -} - -define <16 x float> @sitofpv16i64v16float(<16 x i64> %a) { - ; SSE2-LABEL: sitofpv16i64v16float - ; SSE2: cost of 120 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv16i64v16float - ; AVX1: cost of 43 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv16i64v16float - ; AVX2: cost of 43 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv16i64v16float - ; AVX512F: cost of 45 {{.*}} sitofp - %1 = sitofp <16 x i64> %a to <16 x float> - ret <16 x float> %1 -} - -define <32 x float> @sitofpv32i64v32float(<32 x i64> %a) { - ; SSE2-LABEL: sitofpv32i64v32float - ; SSE2: cost of 240 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv32i64v32float - ; AVX1: cost of 87 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv32i64v32float - ; AVX2: cost of 87 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv32i64v32float - ; AVX512F: cost of 91 {{.*}} sitofp - %1 = sitofp <32 x i64> %a to <32 x float> - ret <32 x float> %1 -} - -define <8 x double> @sitofpv8i1v8double(<8 x double> %a) { - ; SSE2-LABEL: sitofpv8i1v8double - ; SSE2: cost of 80 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv8i1v8double - ; AVX1: cost of 7 {{.*}} sitofp - ; - ; AVX2-LABEL: sitofpv8i1v8double - ; AVX2: cost of 7 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv8i1v8double - ; AVX512F: cost of 4 {{.*}} sitofp - %cmpres = fcmp ogt <8 x double> %a, zeroinitializer - %1 = sitofp <8 x i1> %cmpres to <8 x double> - ret <8 x double> %1 -} - -define <16 x float> @sitofpv16i1v16float(<16 x float> %a) { - ; SSE2-LABEL: sitofpv16i1v16float - ; SSE2: cost of 8 {{.*}} sitofp - ; - ; AVX1-LABEL: sitofpv16i1v16float - ; AVX1: cost of 17 {{.*}} 
sitofp - ; - ; AVX2-LABEL: sitofpv16i1v16float - ; AVX2: cost of 17 {{.*}} sitofp - ; - ; AVX512F-LABEL: sitofpv16i1v16float - ; AVX512F: cost of 3 {{.*}} sitofp - %cmpres = fcmp ogt <16 x float> %a, zeroinitializer - %1 = sitofp <16 x i1> %cmpres to <16 x float> - ret <16 x float> %1 +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f -cost-model -analyze < %s | FileCheck --check-prefix=AVX512 --check-prefix=AVX512F %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512dq -cost-model -analyze < %s | FileCheck --check-prefix=AVX512 --check-prefix=AVX512DQ %s + +; CHECK-LABEL: 'sitofp_i8_double' +define i32 @sitofp_i8_double() { + ; SSE2: cost of 1 {{.*}} sitofp i8 + ; AVX1: cost of 1 {{.*}} sitofp i8 + ; AVX2: cost of 1 {{.*}} sitofp i8 + ; AVX512: cost of 1 {{.*}} sitofp i8 + %cvt_i8_f64 = sitofp i8 undef to double + + ; SSE2: cost of 20 {{.*}} sitofp <2 x i8> + ; AVX1: cost of 4 {{.*}} sitofp <2 x i8> + ; AVX2: cost of 4 {{.*}} sitofp <2 x i8> + ; AVX512: cost of 4 {{.*}} sitofp <2 x i8> + %cvt_v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double> + + ; SSE2: cost of 40 {{.*}} sitofp <4 x i8> + ; AVX1: cost of 3 {{.*}} sitofp <4 x i8> + ; AVX2: cost of 3 {{.*}} sitofp <4 x i8> + ; AVX512: cost of 3 {{.*}} sitofp <4 x i8> + %cvt_v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> + + ; SSE2: cost of 80 {{.*}} sitofp <8 x i8> + ; AVX1: cost of 7 {{.*}} sitofp <8 x i8> + ; AVX2: cost of 7 {{.*}} sitofp <8 x i8> + ; AVX512: cost of 2 {{.*}} sitofp <8 x i8> + %cvt_v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> + + ret i32 undef +} + +; CHECK-LABEL: 'sitofp_i16_double' +define i32 @sitofp_i16_double() { + ; SSE2: cost of 1 {{.*}} sitofp i16 + ; AVX1: cost of 1 {{.*}} sitofp i16 + ; AVX2: cost of 1 {{.*}} sitofp i16 + ; AVX512: cost of 1 {{.*}} sitofp i16 + %cvt_i16_f64 = sitofp i16 undef to double + + ; SSE2: cost of 20 {{.*}} sitofp <2 x i16> + ; AVX1: cost of 4 {{.*}} sitofp <2 x i16> + ; AVX2: cost of 4 {{.*}} sitofp <2 x i16> + ; AVX512: cost of 4 {{.*}} sitofp <2 x i16> + %cvt_v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double> + + ; SSE2: cost of 40 {{.*}} sitofp <4 x i16> + ; AVX1: cost of 3 {{.*}} sitofp <4 x i16> + ; AVX2: cost of 3 {{.*}} sitofp <4 x i16> + ; AVX512: cost of 3 {{.*}} sitofp <4 x i16> + %cvt_v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double> + + ; SSE2: cost of 80 {{.*}} sitofp <8 x i16> + ; AVX1: cost of 7 {{.*}} sitofp <8 x i16> + ; AVX2: cost of 7 {{.*}} sitofp <8 x i16> + ; AVX512: cost of 2 {{.*}} sitofp <8 x i16> + %cvt_v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double> + + ret i32 undef +} + +; CHECK-LABEL: 'sitofp_i32_double' +define i32 @sitofp_i32_double() { + ; SSE2: cost of 1 {{.*}} sitofp i32 + ; AVX1: cost of 1 {{.*}} sitofp i32 + ; AVX2: cost of 1 {{.*}} sitofp i32 + ; AVX512: cost of 1 {{.*}} sitofp i32 + %cvt_i32_f64 = sitofp i32 undef to double + + ; SSE2: cost of 20 {{.*}} sitofp <2 x i32> + ; AVX1: cost of 4 {{.*}} sitofp <2 x i32> + ; AVX2: cost of 4 {{.*}} sitofp <2 x i32> + ; AVX512: cost of 4 {{.*}} sitofp <2 x i32> + %cvt_v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double> + + ; SSE2: cost of 40 {{.*}} sitofp <4 x i32> + ; AVX1: cost of 1 {{.*}} sitofp <4 x i32> + ; AVX2: cost of 1 {{.*}} sitofp <4 x i32> + ; AVX512: cost of 1 {{.*}} sitofp <4 x i32> + %cvt_v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double> + + ; SSE2: cost of 80 {{.*}} sitofp <8 x i32> + ; AVX1: cost of 3 {{.*}} sitofp <8 x i32> + ; AVX2: cost of 3 {{.*}} sitofp <8 x i32> + ; AVX512: cost of 1 {{.*}} sitofp <8 x i32> + %cvt_v8i32_v8f64 = sitofp <8 x i32> 
undef to <8 x double> + + ret i32 undef +} + +; CHECK-LABEL: 'sitofp_i64_double' +define i32 @sitofp_i64_double() { + ; SSE2: cost of 1 {{.*}} sitofp i64 + ; AVX1: cost of 1 {{.*}} sitofp i64 + ; AVX2: cost of 1 {{.*}} sitofp i64 + ; AVX512: cost of 1 {{.*}} sitofp i64 + %cvt_i64_f64 = sitofp i64 undef to double + + ; SSE2: cost of 20 {{.*}} sitofp <2 x i64> + ; AVX1: cost of 20 {{.*}} sitofp <2 x i64> + ; AVX2: cost of 20 {{.*}} sitofp <2 x i64> + ; AVX512F: cost of 20 {{.*}} sitofp <2 x i64> + ; AVX512DQ: cost of 1 {{.*}} sitofp <2 x i64> + %cvt_v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double> + + ; SSE2: cost of 40 {{.*}} sitofp <4 x i64> + ; AVX1: cost of 13 {{.*}} sitofp <4 x i64> + ; AVX2: cost of 13 {{.*}} sitofp <4 x i64> + ; AVX512F: cost of 13 {{.*}} sitofp <4 x i64> + ; AVX512DQ: cost of 1 {{.*}} sitofp <4 x i64> + %cvt_v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double> + + ; SSE2: cost of 80 {{.*}} sitofp <8 x i64> + ; AVX1: cost of 27 {{.*}} sitofp <8 x i64> + ; AVX2: cost of 27 {{.*}} sitofp <8 x i64> + ; AVX512F: cost of 22 {{.*}} sitofp <8 x i64> + ; AVX512DQ: cost of 1 {{.*}} sitofp <8 x i64> + %cvt_v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double> + + ret i32 undef +} + +; CHECK-LABEL: 'sitofp_i8_float' +define i32 @sitofp_i8_float() { + ; SSE2: cost of 1 {{.*}} sitofp i8 + ; AVX1: cost of 1 {{.*}} sitofp i8 + ; AVX2: cost of 1 {{.*}} sitofp i8 + ; AVX512: cost of 1 {{.*}} sitofp i8 + %cvt_i8_f32 = sitofp i8 undef to float + + ; SSE2: cost of 5 {{.*}} sitofp <4 x i8> + ; AVX1: cost of 3 {{.*}} sitofp <4 x i8> + ; AVX2: cost of 3 {{.*}} sitofp <4 x i8> + ; AVX512: cost of 3 {{.*}} sitofp <4 x i8> + %cvt_v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> + + ; SSE2: cost of 15 {{.*}} sitofp <8 x i8> + ; AVX1: cost of 8 {{.*}} sitofp <8 x i8> + ; AVX2: cost of 8 {{.*}} sitofp <8 x i8> + ; AVX512: cost of 8 {{.*}} sitofp <8 x i8> + %cvt_v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> + + ; SSE2: cost of 8 {{.*}} sitofp <16 x i8> + ; AVX1: cost of 17 {{.*}} sitofp <16 x i8> + ; AVX16: cost of 17 {{.*}} sitofp <16 x i8> + ; AVX512: cost of 2 {{.*}} sitofp <16 x i8> + %cvt_v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> + + ret i32 undef +} + +; CHECK-LABEL: 'sitofp_i16_float' +define i32 @sitofp_i16_float() { + ; SSE2: cost of 1 {{.*}} sitofp i16 + ; AVX1: cost of 1 {{.*}} sitofp i16 + ; AVX2: cost of 1 {{.*}} sitofp i16 + ; AVX512: cost of 1 {{.*}} sitofp i16 + %cvt_i16_f32 = sitofp i16 undef to float + + ; SSE2: cost of 5 {{.*}} sitofp <4 x i16> + ; AVX1: cost of 3 {{.*}} sitofp <4 x i16> + ; AVX2: cost of 3 {{.*}} sitofp <4 x i16> + ; AVX512: cost of 3 {{.*}} sitofp <4 x i16> + %cvt_v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float> + + ; SSE2: cost of 15 {{.*}} sitofp <8 x i16> + ; AVX1: cost of 5 {{.*}} sitofp <8 x i16> + ; AVX2: cost of 5 {{.*}} sitofp <8 x i16> + ; AVX512: cost of 5 {{.*}} sitofp <8 x i16> + %cvt_v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float> + + ; SSE2: cost of 30 {{.*}} sitofp <16 x i16> + ; AVX1: cost of 11 {{.*}} sitofp <16 x i16> + ; AVX16: cost of 11 {{.*}} sitofp <16 x i16> + ; AVX512: cost of 2 {{.*}} sitofp <16 x i16> + %cvt_v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float> + + ret i32 undef +} + +; CHECK-LABEL: 'sitofp_i32_float' +define i32 @sitofp_i32_float() { + ; SSE2: cost of 1 {{.*}} sitofp i32 + ; AVX1: cost of 1 {{.*}} sitofp i32 + ; AVX2: cost of 1 {{.*}} sitofp i32 + ; AVX512: cost of 1 {{.*}} sitofp i32 + %cvt_i32_f32 = sitofp i32 undef to float + + ; SSE2: cost of 5 {{.*}} sitofp <4 x i32> + ; AVX1: cost 
of 1 {{.*}} sitofp <4 x i32> + ; AVX2: cost of 1 {{.*}} sitofp <4 x i32> + ; AVX512: cost of 1 {{.*}} sitofp <4 x i32> + %cvt_v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float> + + ; SSE2: cost of 10 {{.*}} sitofp <8 x i32> + ; AVX1: cost of 1 {{.*}} sitofp <8 x i32> + ; AVX2: cost of 1 {{.*}} sitofp <8 x i32> + ; AVX512: cost of 1 {{.*}} sitofp <8 x i32> + %cvt_v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float> + + ; SSE2: cost of 20 {{.*}} sitofp <16 x i32> + ; AVX1: cost of 3 {{.*}} sitofp <16 x i32> + ; AVX2: cost of 3 {{.*}} sitofp <16 x i32> + ; AVX512: cost of 1 {{.*}} sitofp <16 x i32> + %cvt_v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float> + + ret i32 undef +} + +; CHECK-LABEL: 'sitofp_i64_float' +define i32 @sitofp_i64_float() { + ; SSE2: cost of 1 {{.*}} sitofp i64 + ; AVX1: cost of 1 {{.*}} sitofp i64 + ; AVX2: cost of 1 {{.*}} sitofp i64 + ; AVX512: cost of 1 {{.*}} sitofp i64 + %cvt_i64_f32 = sitofp i64 undef to float + + ; SSE2: cost of 15 {{.*}} sitofp <2 x i64> + ; AVX1: cost of 4 {{.*}} sitofp <2 x i64> + ; AVX2: cost of 4 {{.*}} sitofp <2 x i64> + ; AVX512F: cost of 4 {{.*}} sitofp <2 x i64> + ; AVX512DQ: cost of 1 {{.*}} sitofp <2 x i64> + %cvt_v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float> + + ; SSE2: cost of 30 {{.*}} sitofp <4 x i64> + ; AVX1: cost of 10 {{.*}} sitofp <4 x i64> + ; AVX2: cost of 10 {{.*}} sitofp <4 x i64> + ; AVX512F: cost of 10 {{.*}} sitofp <4 x i64> + ; AVX512DQ: cost of 1 {{.*}} sitofp <4 x i64> + %cvt_v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float> + + ; SSE2: cost of 60 {{.*}} sitofp <8 x i64> + ; AVX1: cost of 21 {{.*}} sitofp <8 x i64> + ; AVX2: cost of 21 {{.*}} sitofp <8 x i64> + ; AVX512F: cost of 22 {{.*}} sitofp <8 x i64> + ; AVX512DQ: cost of 1 {{.*}} sitofp <8 x i64> + %cvt_v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float> + + ; SSE2: cost of 120 {{.*}} sitofp <16 x i64> + ; AVX1: cost of 43 {{.*}} sitofp <16 x i64> + ; AVX2: cost of 43 {{.*}} sitofp <16 x i64> + ; AVX512F: cost of 45 {{.*}} sitofp <16 x i64> + ; AVX512DQ: cost of 3 {{.*}} sitofp <16 x i64> + %cvt_v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float> + + ret i32 undef } diff --git a/test/Analysis/CostModel/X86/strided-load-i16.ll b/test/Analysis/CostModel/X86/strided-load-i16.ll new file mode 100755 index 000000000000..2c2cf3938bcb --- /dev/null +++ b/test/Analysis/CostModel/X86/strided-load-i16.ll @@ -0,0 +1,113 @@ +; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [10240 x i16] zeroinitializer, align 16
+@B = global [10240 x i16] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define void @load_i16_stride2() {
+;CHECK-LABEL: load_i16_stride2
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 32 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
+ %1 = load i16, i16* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
+ store i16 %1, i16* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i16_stride3() {
+;CHECK-LABEL: load_i16_stride3
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 32 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 3
+ %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
+ %1 = load i16, i16* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
+ store i16 %1, i16* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i16_stride4() {
+;CHECK-LABEL: load_i16_stride4
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 5 for VF 32 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 2
+ %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
+ %1 = load i16, i16* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
+ store i16 %1, i16* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i16_stride5() {
+;CHECK-LABEL: load_i16_stride5
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 6 for VF 32 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 5
+ %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0
+ %1 = load i16, i16* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv
+ store i16 %1, i16* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
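For reference, each strided-load test above hand-writes, in IR, the loop the vectorizer sees for a strided copy. A rough C equivalent of @load_i16_stride2 (a sketch reconstructed from the IR above, not part of the test; the global arrays mirror @A and @B) would be:

    /* Sketch of the stride-2 i16 loop from @load_i16_stride2: B[i] = A[2*i]. */
    short A[10240], B[10240];   /* zero-initialized globals, as in the IR */

    void load_i16_stride2(void) {
      for (long i = 0; i < 1024; ++i)
        B[i] = A[2 * i];        /* one-element gap between consecutive loads */
    }

The other stride-N functions follow the same pattern with A[N * i] in place of A[2 * i].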
diff --git a/test/Analysis/CostModel/X86/strided-load-i32.ll b/test/Analysis/CostModel/X86/strided-load-i32.ll
new file mode 100755
index 000000000000..0dcd3929da7f
--- /dev/null
+++ b/test/Analysis/CostModel/X86/strided-load-i32.ll
@@ -0,0 +1,110 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [10240 x i32] zeroinitializer, align 16
+@B = global [10240 x i32] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define void @load_int_stride2() {
+;CHECK-LABEL: load_int_stride2
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
+ %1 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
+ store i32 %1, i32* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_int_stride3() {
+;CHECK-LABEL: load_int_stride3
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 3
+ %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
+ %1 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
+ store i32 %1, i32* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_int_stride4() {
+;CHECK-LABEL: load_int_stride4
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 5 for VF 16 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 2
+ %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
+ %1 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
+ store i32 %1, i32* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_int_stride5() {
+;CHECK-LABEL: load_int_stride5
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 6 for VF 16 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 5
+ %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0
+ %1 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv
+ store i32 %1, i32* %arrayidx2, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
diff --git a/test/Analysis/CostModel/X86/strided-load-i64.ll b/test/Analysis/CostModel/X86/strided-load-i64.ll
new file mode 100755
index 000000000000..0370b6f80efd
--- /dev/null
+++ b/test/Analysis/CostModel/X86/strided-load-i64.ll
@@ -0,0 +1,81 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [10240 x i64] zeroinitializer, align 16
+@B = global [10240 x i64] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define void @load_i64_stride2() {
+;CHECK-LABEL: load_i64_stride2
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0
+ %1 = load i64, i64* %arrayidx, align 16
+ %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+ store i64 %1, i64* %arrayidx2, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i64_stride3() {
+;CHECK-LABEL: load_i64_stride3
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 3
+ %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0
+ %1 = load i64, i64* %arrayidx, align 16
+ %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+ store i64 %1, i64* %arrayidx2, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i64_stride4() {
+;CHECK-LABEL: load_i64_stride4
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 5 for VF 8 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 4
+ %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0
+ %1 = load i64, i64* %arrayidx, align 16
+ %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+ store i64 %1, i64* %arrayidx2, align 8
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/Analysis/CostModel/X86/strided-load-i8.ll b/test/Analysis/CostModel/X86/strided-load-i8.ll
new file mode 100755
index 000000000000..2a3a83864151
--- /dev/null
+++ b/test/Analysis/CostModel/X86/strided-load-i8.ll
@@ -0,0 +1,117 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@A = global [10240 x i8] zeroinitializer, align 16
+@B = global [10240 x i8] zeroinitializer, align 16
+
+; Function Attrs: nounwind uwtable
+define void @load_i8_stride2() {
+;CHECK-LABEL: load_i8_stride2
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 8 for VF 32 For instruction: %1 = load
+;CHECK: Found an estimated cost of 20 for VF 64 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
+ %1 = load i8, i8* %arrayidx, align 2
+ %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
+ store i8 %1, i8* %arrayidx2, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i8_stride3() {
+;CHECK-LABEL: load_i8_stride3
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load
+;CHECK: Found an estimated cost of 39 for VF 64 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 3
+ %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
+ %1 = load i8, i8* %arrayidx, align 2
+ %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
+ store i8 %1, i8* %arrayidx2, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i8_stride4() {
+;CHECK-LABEL: load_i8_stride4
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load
+;CHECK: Found an estimated cost of 59 for VF 64 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 2
+ %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
+ %1 = load i8, i8* %arrayidx, align 2
+ %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
+ store i8 %1, i8* %arrayidx2, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+define void @load_i8_stride5() {
+;CHECK-LABEL: load_i8_stride5
+;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load
+;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load
+;CHECK: Found an estimated cost of 3 for VF 4 For instruction: %1 = load
+;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %1 = load
+;CHECK: Found an estimated cost of 20 for VF 16 For instruction: %1 = load
+;CHECK: Found an estimated cost of 39 for VF 32 For instruction: %1 = load
+;CHECK: Found an estimated cost of 78 for VF 64 For instruction: %1 = load
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = mul nsw i64 %indvars.iv, 5
+ %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0
+ %1 = load i8, i8* %arrayidx, align 2
+ %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv
+ store i8 %1, i8* %arrayidx2, align 1
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1024
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/Analysis/CostModel/X86/trunc.ll b/test/Analysis/CostModel/X86/trunc.ll new file mode 100644 index 000000000000..a270251c2b17 --- /dev/null +++ b/test/Analysis/CostModel/X86/trunc.ll @@ -0,0 +1,141 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSSE3 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE42 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2 +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; CHECK-LABEL: 'trunc_vXi32' +define i32 @trunc_vXi32() { + ; SSE: cost of 0 {{.*}} %V2i64 = trunc + ; AVX1: cost of 0 {{.*}} %V2i64 = trunc + ; AVX2: cost of 0 {{.*}} %V2i64 = trunc + ; AVX512: cost of 0 {{.*}} %V2i64 = trunc + %V2i64 = trunc <2 x i64> undef to <2 x i32> + + ; SSE: cost of 1 {{.*}} %V4i64 = trunc + ; AVX1: cost of 4 {{.*}} %V4i64 = trunc + ; AVX2: cost of 2 {{.*}} %V4i64 = trunc + ; AVX512: cost of 2 {{.*}} %V4i64 = trunc + %V4i64 = trunc <4 x i64> undef to <4 x i32> + + ; SSE: cost of 3 {{.*}} %V8i64 = trunc + ; AVX1: cost of 9 {{.*}} %V8i64 = trunc + ; AVX2: cost of 4 {{.*}} %V8i64 = trunc + ; AVX512: cost of 1 {{.*}} %V8i64 = trunc + %V8i64 = trunc <8 x i64> undef to <8 x i32> + + ret i32 undef +} + +; CHECK-LABEL: 'trunc_vXi16' +define i32 @trunc_vXi16() { + ; SSE: cost of 0 {{.*}} %V2i64 = trunc + ; AVX: cost of 0 {{.*}} %V2i64 = trunc + %V2i64 = trunc <2 x i64> undef to <2 x i16> + + ; SSE: cost of 1 {{.*}} %V4i64 = trunc + ; AVX1: cost of 4 {{.*}} %V4i64 = trunc + ; AVX2: cost of 2 {{.*}} %V4i64 = trunc + ; AVX512: cost of 2 {{.*}} %V4i64 = trunc + %V4i64 = trunc <4 x i64> undef to <4 x i16> + + ; SSE: cost of 3 {{.*}} %V8i64 = trunc + ; AVX: cost of 0 {{.*}} %V8i64 = trunc + %V8i64 = trunc <8 x i64> undef to <8 x i16> + + ; SSE2: cost of 3 {{.*}} %V4i32 = trunc + ; SSSE3: cost of 3 {{.*}} %V4i32 = trunc + ; SSE42: cost of 1 {{.*}} %V4i32 = trunc + ; AVX1: cost of 1 {{.*}} %V4i32 = trunc + ; AVX2: cost of 1 {{.*}} %V4i32 = trunc + ; AVX512: cost of 1 {{.*}} %V4i32 = trunc + %V4i32 = trunc <4 x i32> undef to <4 x i16> + + ; SSE2: cost of 5 {{.*}} %V8i32 = trunc + ; SSSE3: cost of 5 {{.*}} %V8i32 = trunc + ; SSE42: cost of 3 {{.*}} %V8i32 = trunc + ; AVX1: cost of 5 {{.*}} %V8i32 = trunc + ; AVX2: cost of 2 {{.*}} %V8i32 = trunc + ; AVX512: cost of 2 {{.*}} %V8i32 = trunc + %V8i32 = trunc <8 x i32> undef to <8 x i16> + + ; SSE2: cost of 10 {{.*}} %V16i32 = trunc + ; SSSE3: cost of 10 {{.*}} %V16i32 = 
trunc + ; SSE42: cost of 6 {{.*}} %V16i32 = trunc + ; AVX1: cost of 6 {{.*}} %V16i32 = trunc + ; AVX2: cost of 6 {{.*}} %V16i32 = trunc + ; AVX512: cost of 1 {{.*}} %V16i32 = trunc + %V16i32 = trunc <16 x i32> undef to <16 x i16> + + ret i32 undef +} + +; CHECK-LABEL: 'trunc_vXi8' +define i32 @trunc_vXi8() { + ; SSE: cost of 0 {{.*}} %V2i64 = trunc + ; AVX: cost of 0 {{.*}} %V2i64 = trunc + %V2i64 = trunc <2 x i64> undef to <2 x i8> + + ; SSE: cost of 1 {{.*}} %V4i64 = trunc + ; AVX1: cost of 4 {{.*}} %V4i64 = trunc + ; AVX2: cost of 2 {{.*}} %V4i64 = trunc + ; AVX512: cost of 2 {{.*}} %V4i64 = trunc + %V4i64 = trunc <4 x i64> undef to <4 x i8> + + ; SSE: cost of 3 {{.*}} %V8i64 = trunc + ; AVX: cost of 0 {{.*}} %V8i64 = trunc + %V8i64 = trunc <8 x i64> undef to <8 x i8> + + ; SSE: cost of 0 {{.*}} %V2i32 = trunc + ; AVX: cost of 0 {{.*}} %V2i32 = trunc + %V2i32 = trunc <2 x i32> undef to <2 x i8> + + ; SSE2: cost of 3 {{.*}} %V4i32 = trunc + ; SSSE3: cost of 3 {{.*}} %V4i32 = trunc + ; SSE42: cost of 1 {{.*}} %V4i32 = trunc + ; AVX: cost of 1 {{.*}} %V4i32 = trunc + %V4i32 = trunc <4 x i32> undef to <4 x i8> + + ; SSE2: cost of 4 {{.*}} %V8i32 = trunc + ; SSSE3: cost of 4 {{.*}} %V8i32 = trunc + ; SSE42: cost of 3 {{.*}} %V8i32 = trunc + ; AVX1: cost of 4 {{.*}} %V8i32 = trunc + ; AVX2: cost of 2 {{.*}} %V8i32 = trunc + ; AVX512: cost of 2 {{.*}} %V8i32 = trunc + %V8i32 = trunc <8 x i32> undef to <8 x i8> + + ; SSE: cost of 7 {{.*}} %V16i32 = trunc + ; AVX: cost of 7 {{.*}} %V16i32 = trunc + %V16i32 = trunc <16 x i32> undef to <16 x i8> + + ; SSE: cost of 0 {{.*}} %V2i16 = trunc + ; AVX: cost of 0 {{.*}} %V2i16 = trunc + %V2i16 = trunc <2 x i16> undef to <2 x i8> + + ; SSE2: cost of 4 {{.*}} %V4i16 = trunc + ; SSSE3: cost of 4 {{.*}} %V4i16 = trunc + ; SSE42: cost of 2 {{.*}} %V4i16 = trunc + ; AVX: cost of 2 {{.*}} %V4i16 = trunc + %V4i16 = trunc <4 x i16> undef to <4 x i8> + + ; SSE2: cost of 2 {{.*}} %V8i16 = trunc + ; SSSE3: cost of 2 {{.*}} %V8i16 = trunc + ; SSE42: cost of 1 {{.*}} %V8i16 = trunc + ; AVX: cost of 1 {{.*}} %V8i16 = trunc + %V8i16 = trunc <8 x i16> undef to <8 x i8> + + ; SSE: cost of 3 {{.*}} %V16i16 = trunc + ; AVX: cost of 4 {{.*}} %V16i16 = trunc + %V16i16 = trunc <16 x i16> undef to <16 x i8> + + ; SSE: cost of 7 {{.*}} %V32i16 = trunc + ; AVX: cost of 9 {{.*}} %V32i16 = trunc + %V32i16 = trunc <32 x i16> undef to <32 x i8> + + ret i32 undef +} diff --git a/test/Analysis/CostModel/X86/uitofp.ll b/test/Analysis/CostModel/X86/uitofp.ll index 2eb8407974f7..a0b48c0b4501 100644 --- a/test/Analysis/CostModel/X86/uitofp.ll +++ b/test/Analysis/CostModel/X86/uitofp.ll @@ -1,709 +1,250 @@ ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx -cost-model -analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX1 %s ; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx2 -cost-model -analyze < %s | FileCheck --check-prefix=AVX --check-prefix=AVX2 %s -; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f -cost-model -analyze < %s | FileCheck --check-prefix=AVX512F %s -; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512dq -cost-model -analyze < %s | FileCheck --check-prefix=AVX512DQ %s - -define <2 x double> @uitofpv2i8v2double(<2 x i8> %a) { - ; SSE2-LABEL: uitofpv2i8v2double - ; SSE2: cost of 20 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv2i8v2double - ; AVX1: cost of 4 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv2i8v2double - ; AVX2: cost of 4 
{{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv2i8v2double - ; AVX512F: cost of 2 {{.*}} uitofp - %1 = uitofp <2 x i8> %a to <2 x double> - ret <2 x double> %1 -} - -define <4 x double> @uitofpv4i8v4double(<4 x i8> %a) { - ; SSE2-LABEL: uitofpv4i8v4double - ; SSE2: cost of 40 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv4i8v4double - ; AVX1: cost of 2 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv4i8v4double - ; AVX2: cost of 2 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv4i8v4double - ; AVX512F: cost of 2 {{.*}} uitofp - %1 = uitofp <4 x i8> %a to <4 x double> - ret <4 x double> %1 -} - -define <8 x double> @uitofpv8i8v8double(<8 x i8> %a) { - ; SSE2-LABEL: uitofpv8i8v8double - ; SSE2: cost of 80 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv8i8v8double - ; AVX1: cost of 5 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv8i8v8double - ; AVX2: cost of 5 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv8i8v8double - ; AVX512F: cost of 2 {{.*}} uitofp - %1 = uitofp <8 x i8> %a to <8 x double> - ret <8 x double> %1 -} - -define <16 x double> @uitofpv16i8v16double(<16 x i8> %a) { - ; SSE2-LABEL: uitofpv16i8v16double - ; SSE2: cost of 160 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv16i8v16double - ; AVX1: cost of 11 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv16i8v16double - ; AVX2: cost of 11 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv16i8v16double - ; AVX512F: cost of 5 {{.*}} uitofp - %1 = uitofp <16 x i8> %a to <16 x double> - ret <16 x double> %1 -} - -define <32 x double> @uitofpv32i8v32double(<32 x i8> %a) { - ; SSE2-LABEL: uitofpv32i8v32double - ; SSE2: cost of 320 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv32i8v32double - ; AVX1: cost of 23 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv32i8v32double - ; AVX2: cost of 23 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv32i8v32double - ; AVX512F: cost of 11 {{.*}} uitofp - %1 = uitofp <32 x i8> %a to <32 x double> - ret <32 x double> %1 -} - -define <2 x double> @uitofpv2i16v2double(<2 x i16> %a) { - ; SSE2-LABEL: uitofpv2i16v2double - ; SSE2: cost of 20 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv2i16v2double - ; AVX1: cost of 4 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv2i16v2double - ; AVX2: cost of 4 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv2i16v2double - ; AVX512F: cost of 5 {{.*}} uitofp - %1 = uitofp <2 x i16> %a to <2 x double> - ret <2 x double> %1 -} - -define <4 x double> @uitofpv4i16v4double(<4 x i16> %a) { - ; SSE2-LABEL: uitofpv4i16v4double - ; SSE2: cost of 40 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv4i16v4double - ; AVX1: cost of 2 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv4i16v4double - ; AVX2: cost of 2 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv4i16v4double - ; AVX512F: cost of 2 {{.*}} uitofp - %1 = uitofp <4 x i16> %a to <4 x double> - ret <4 x double> %1 -} - -define <8 x double> @uitofpv8i16v8double(<8 x i16> %a) { - ; SSE2-LABEL: uitofpv8i16v8double - ; SSE2: cost of 80 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv8i16v8double - ; AVX1: cost of 5 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv8i16v8double - ; AVX2: cost of 5 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv8i16v8double - ; AVX512F: cost of 2 {{.*}} uitofp - %1 = uitofp <8 x i16> %a to <8 x double> - ret <8 x double> %1 -} - -define <16 x double> @uitofpv16i16v16double(<16 x i16> %a) { - ; SSE2-LABEL: uitofpv16i16v16double - ; SSE2: cost of 160 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv16i16v16double - ; AVX1: cost of 11 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv16i16v16double - ; AVX2: cost of 11 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv16i16v16double - ; AVX512F: cost of 5 {{.*}} uitofp - %1 = uitofp <16 x 
i16> %a to <16 x double> - ret <16 x double> %1 -} - -define <32 x double> @uitofpv32i16v32double(<32 x i16> %a) { - ; SSE2-LABEL: uitofpv32i16v32double - ; SSE2: cost of 320 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv32i16v32double - ; AVX1: cost of 23 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv32i16v32double - ; AVX2: cost of 23 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv32i16v32double - ; AVX512F: cost of 11 {{.*}} uitofp - %1 = uitofp <32 x i16> %a to <32 x double> - ret <32 x double> %1 -} - -define <2 x double> @uitofpv2i32v2double(<2 x i32> %a) { - ; SSE2-LABEL: uitofpv2i32v2double - ; SSE2: cost of 20 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv2i32v2double - ; AVX1: cost of 6 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv2i32v2double - ; AVX2: cost of 6 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv2i32v2double - ; AVX512F: cost of 1 {{.*}} uitofp - %1 = uitofp <2 x i32> %a to <2 x double> - ret <2 x double> %1 -} - -define <4 x double> @uitofpv4i32v4double(<4 x i32> %a) { - ; SSE2-LABEL: uitofpv4i32v4double - ; SSE2: cost of 40 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv4i32v4double - ; AVX1: cost of 6 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv4i32v4double - ; AVX2: cost of 6 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv4i32v4double - ; AVX512F: cost of 1 {{.*}} uitofp - %1 = uitofp <4 x i32> %a to <4 x double> - ret <4 x double> %1 -} - -define <8 x double> @uitofpv8i32v8double(<8 x i32> %a) { - ; SSE2-LABEL: uitofpv8i32v8double - ; SSE2: cost of 80 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv8i32v8double - ; AVX1: cost of 13 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv8i32v8double - ; AVX2: cost of 13 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv8i32v8double - ; AVX512F: cost of 1 {{.*}} uitofp - %1 = uitofp <8 x i32> %a to <8 x double> - ret <8 x double> %1 -} - -define <16 x double> @uitofpv16i32v16double(<16 x i32> %a) { - ; SSE2-LABEL: uitofpv16i32v16double - ; SSE2: cost of 160 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv16i32v16double - ; AVX1: cost of 27 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv16i32v16double - ; AVX2: cost of 27 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv16i32v16double - ; AVX512F: cost of 3 {{.*}} uitofp - %1 = uitofp <16 x i32> %a to <16 x double> - ret <16 x double> %1 -} - -define <32 x double> @uitofpv32i32v32double(<32 x i32> %a) { - ; SSE2-LABEL: uitofpv32i32v32double - ; SSE2: cost of 320 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv32i32v32double - ; AVX1: cost of 55 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv32i32v32double - ; AVX2: cost of 55 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv32i32v32double - ; AVX512F: cost of 7 {{.*}} uitofp - %1 = uitofp <32 x i32> %a to <32 x double> - ret <32 x double> %1 -} - -define <2 x double> @uitofpv2i64v2double(<2 x i64> %a) { - ; SSE2-LABEL: uitofpv2i64v2double - ; SSE2: cost of 20 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv2i64v2double - ; AVX1: cost of 10 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv2i64v2double - ; AVX2: cost of 10 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv2i64v2double - ; AVX512F: cost of 5 {{.*}} uitofp - ; - ; AVX512DQ-LABEL: uitofpv2i64v2double - ; AVX512DQ: cost of 1 {{.*}} uitofp - %1 = uitofp <2 x i64> %a to <2 x double> - ret <2 x double> %1 -} - -define <4 x double> @uitofpv4i64v4double(<4 x i64> %a) { - ; SSE2-LABEL: uitofpv4i64v4double - ; SSE2: cost of 40 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv4i64v4double - ; AVX1: cost of 20 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv4i64v4double - ; AVX2: cost of 20 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv4i64v4double - ; AVX512F: cost of 12 {{.*}} uitofp - ; - ; 
AVX512DQ-LABEL: uitofpv4i64v4double - ; AVX512DQ: cost of 1 {{.*}} uitofp - %1 = uitofp <4 x i64> %a to <4 x double> - ret <4 x double> %1 -} - -define <8 x double> @uitofpv8i64v8double(<8 x i64> %a) { - ; SSE2-LABEL: uitofpv8i64v8double - ; SSE2: cost of 80 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv8i64v8double - ; AVX1: cost of 41 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv8i64v8double - ; AVX2: cost of 41 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv8i64v8double - ; AVX512F: cost of 26 {{.*}} uitofp - ; - ; AVX512DQ-LABEL: uitofpv8i64v8double - ; AVX512DQ: cost of 1 {{.*}} uitofp - %1 = uitofp <8 x i64> %a to <8 x double> - ret <8 x double> %1 -} - -define <16 x double> @uitofpv16i64v16double(<16 x i64> %a) { - ; SSE2-LABEL: uitofpv16i64v16double - ; SSE2: cost of 160 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv16i64v16double - ; AVX1: cost of 83 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv16i64v16double - ; AVX2: cost of 83 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv16i64v16double - ; AVX512F: cost of 53 {{.*}} uitofp - ; - ; AVX512DQ-LABEL: uitofpv16i64v16double - ; AVX512DQ: cost of 3 {{.*}} uitofp - %1 = uitofp <16 x i64> %a to <16 x double> - ret <16 x double> %1 -} - -define <32 x double> @uitofpv32i64v32double(<32 x i64> %a) { - ; SSE2-LABEL: uitofpv32i64v32double - ; SSE2: cost of 320 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv32i64v32double - ; AVX1: cost of 167 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv32i64v32double - ; AVX2: cost of 167 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv32i64v32double - ; AVX512F: cost of 107 {{.*}} uitofp - ; - ; AVX512DQ-LABEL: uitofpv32i64v32double - ; AVX512DQ: cost of 2 {{.*}} uitofp - %1 = uitofp <32 x i64> %a to <32 x double> - ret <32 x double> %1 -} - -define <2 x float> @uitofpv2i8v2float(<2 x i8> %a) { - ; SSE2-LABEL: uitofpv2i8v2float - ; SSE2: cost of 15 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv2i8v2float - ; AVX1: cost of 4 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv2i8v2float - ; AVX2: cost of 4 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv2i8v2float - ; AVX512F: cost of 4 {{.*}} uitofp - %1 = uitofp <2 x i8> %a to <2 x float> - ret <2 x float> %1 -} - -define <4 x float> @uitofpv4i8v4float(<4 x i8> %a) { - ; SSE2-LABEL: uitofpv4i8v4float - ; SSE2: cost of 8 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv4i8v4float - ; AVX1: cost of 2 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv4i8v4float - ; AVX2: cost of 2 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv4i8v4float - ; AVX512F: cost of 2 {{.*}} uitofp - %1 = uitofp <4 x i8> %a to <4 x float> - ret <4 x float> %1 -} - -define <8 x float> @uitofpv8i8v8float(<8 x i8> %a) { - ; SSE2-LABEL: uitofpv8i8v8float - ; SSE2: cost of 15 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv8i8v8float - ; AVX1: cost of 5 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv8i8v8float - ; AVX2: cost of 5 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv8i8v8float - ; AVX512F: cost of 2 {{.*}} uitofp - %1 = uitofp <8 x i8> %a to <8 x float> - ret <8 x float> %1 -} - -define <16 x float> @uitofpv16i8v16float(<16 x i8> %a) { - ; SSE2-LABEL: uitofpv16i8v16float - ; SSE2: cost of 8 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv16i8v16float - ; AVX1: cost of 11 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv16i8v16float - ; AVX2: cost of 11 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv16i8v16float - ; AVX512F: cost of 2 {{.*}} uitofp - %1 = uitofp <16 x i8> %a to <16 x float> - ret <16 x float> %1 -} - -define <32 x float> @uitofpv32i8v32float(<32 x i8> %a) { - ; SSE2-LABEL: uitofpv32i8v32float - ; SSE2: cost of 16 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv32i8v32float - ; 
AVX1: cost of 23 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv32i8v32float - ; AVX2: cost of 23 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv32i8v32float - ; AVX512F: cost of 5 {{.*}} uitofp - %1 = uitofp <32 x i8> %a to <32 x float> - ret <32 x float> %1 -} - -define <2 x float> @uitofpv2i16v2float(<2 x i16> %a) { - ; SSE2-LABEL: uitofpv2i16v2float - ; SSE2: cost of 15 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv2i16v2float - ; AVX1: cost of 4 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv2i16v2float - ; AVX2: cost of 4 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv2i16v2float - ; AVX512F: cost of 4 {{.*}} uitofp - %1 = uitofp <2 x i16> %a to <2 x float> - ret <2 x float> %1 -} - -define <4 x float> @uitofpv4i16v4float(<4 x i16> %a) { - ; SSE2-LABEL: uitofpv4i16v4float - ; SSE2: cost of 8 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv4i16v4float - ; AVX1: cost of 2 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv4i16v4float - ; AVX2: cost of 2 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv4i16v4float - ; AVX512F: cost of 2 {{.*}} uitofp - %1 = uitofp <4 x i16> %a to <4 x float> - ret <4 x float> %1 -} - -define <8 x float> @uitofpv8i16v8float(<8 x i16> %a) { - ; SSE2-LABEL: uitofpv8i16v8float - ; SSE2: cost of 15 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv8i16v8float - ; AVX1: cost of 5 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv8i16v8float - ; AVX2: cost of 5 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv8i16v8float - ; AVX512F: cost of 2 {{.*}} uitofp - %1 = uitofp <8 x i16> %a to <8 x float> - ret <8 x float> %1 -} - -define <16 x float> @uitofpv16i16v16float(<16 x i16> %a) { - ; SSE2-LABEL: uitofpv16i16v16float - ; SSE2: cost of 30 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv16i16v16float - ; AVX1: cost of 11 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv16i16v16float - ; AVX2: cost of 11 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv16i16v16float - ; AVX512F: cost of 2 {{.*}} uitofp - %1 = uitofp <16 x i16> %a to <16 x float> - ret <16 x float> %1 -} - -define <32 x float> @uitofpv32i16v32float(<32 x i16> %a) { - ; SSE2-LABEL: uitofpv32i16v32float - ; SSE2: cost of 60 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv32i16v32float - ; AVX1: cost of 23 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv32i16v32float - ; AVX2: cost of 23 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv32i16v32float - ; AVX512F: cost of 5 {{.*}} uitofp - %1 = uitofp <32 x i16> %a to <32 x float> - ret <32 x float> %1 -} - -define <2 x float> @uitofpv2i32v2float(<2 x i32> %a) { - ; SSE2-LABEL: uitofpv2i32v2float - ; SSE2: cost of 15 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv2i32v2float - ; AVX1: cost of 4 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv2i32v2float - ; AVX2: cost of 4 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv2i32v2float - ; AVX512F: cost of 2 {{.*}} uitofp - %1 = uitofp <2 x i32> %a to <2 x float> - ret <2 x float> %1 -} - -define <4 x float> @uitofpv4i32v4float(<4 x i32> %a) { - ; SSE2-LABEL: uitofpv4i32v4float - ; SSE2: cost of 8 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv4i32v4float - ; AVX1: cost of 6 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv4i32v4float - ; AVX2: cost of 6 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv4i32v4float - ; AVX512F: cost of 1 {{.*}} uitofp - %1 = uitofp <4 x i32> %a to <4 x float> - ret <4 x float> %1 -} - -define <8 x float> @uitofpv8i32v8float(<8 x i32> %a) { - ; SSE2-LABEL: uitofpv8i32v8float - ; SSE2: cost of 16 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv8i32v8float - ; AVX1: cost of 9 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv8i32v8float - ; AVX2: cost of 8 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv8i32v8float - ; AVX512F: cost 
of 1 {{.*}} uitofp - %1 = uitofp <8 x i32> %a to <8 x float> - ret <8 x float> %1 -} - -define <16 x float> @uitofpv16i32v16float(<16 x i32> %a) { - ; SSE2-LABEL: uitofpv16i32v16float - ; SSE2: cost of 32 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv16i32v16float - ; AVX1: cost of 19 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv16i32v16float - ; AVX2: cost of 17 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv16i32v16float - ; AVX512F: cost of 1 {{.*}} uitofp - %1 = uitofp <16 x i32> %a to <16 x float> - ret <16 x float> %1 -} - -define <32 x float> @uitofpv32i32v32float(<32 x i32> %a) { - ; SSE2-LABEL: uitofpv32i32v32float - ; SSE2: cost of 64 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv32i32v32float - ; AVX1: cost of 39 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv32i32v32float - ; AVX2: cost of 35 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv32i32v32float - ; AVX512F: cost of 3 {{.*}} uitofp - %1 = uitofp <32 x i32> %a to <32 x float> - ret <32 x float> %1 -} - -define <2 x float> @uitofpv2i64v2float(<2 x i64> %a) { - ; SSE2-LABEL: uitofpv2i64v2float - ; SSE2: cost of 15 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv2i64v2float - ; AVX1: cost of 4 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv2i64v2float - ; AVX2: cost of 4 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv2i64v2float - ; AVX512F: cost of 5 {{.*}} uitofp - %1 = uitofp <2 x i64> %a to <2 x float> - ret <2 x float> %1 -} - -define <4 x float> @uitofpv4i64v4float(<4 x i64> %a) { - ; SSE2-LABEL: uitofpv4i64v4float - ; SSE2: cost of 30 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv4i64v4float - ; AVX1: cost of 10 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv4i64v4float - ; AVX2: cost of 10 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv4i64v4float - ; AVX512F: cost of 10 {{.*}} uitofp - %1 = uitofp <4 x i64> %a to <4 x float> - ret <4 x float> %1 -} - -define <8 x float> @uitofpv8i64v8float(<8 x i64> %a) { - ; SSE2-LABEL: uitofpv8i64v8float - ; SSE2: cost of 60 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv8i64v8float - ; AVX1: cost of 21 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv8i64v8float - ; AVX2: cost of 21 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv8i64v8float - ; AVX512F: cost of 26 {{.*}} uitofp - %1 = uitofp <8 x i64> %a to <8 x float> - ret <8 x float> %1 -} - -define <16 x float> @uitofpv16i64v16float(<16 x i64> %a) { - ; SSE2-LABEL: uitofpv16i64v16float - ; SSE2: cost of 120 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv16i64v16float - ; AVX1: cost of 43 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv16i64v16float - ; AVX2: cost of 43 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv16i64v16float - ; AVX512F: cost of 53 {{.*}} uitofp - %1 = uitofp <16 x i64> %a to <16 x float> - ret <16 x float> %1 -} - -define <32 x float> @uitofpv32i64v32float(<32 x i64> %a) { - ; SSE2-LABEL: uitofpv32i64v32float - ; SSE2: cost of 240 {{.*}} uitofp - ; - ; AVX1-LABEL: uitofpv32i64v32float - ; AVX1: cost of 87 {{.*}} uitofp - ; - ; AVX2-LABEL: uitofpv32i64v32float - ; AVX2: cost of 87 {{.*}} uitofp - ; - ; AVX512F-LABEL: uitofpv32i64v32float - ; AVX512F: cost of 107 {{.*}} uitofp - %1 = uitofp <32 x i64> %a to <32 x float> - ret <32 x float> %1 -} - -define <8 x i32> @fptouiv8f32v8i32(<8 x float> %a) { - ; AVX512F-LABEL: fptouiv8f32v8i32 - ; AVX512F: cost of 1 {{.*}} fptoui - %1 = fptoui <8 x float> %a to <8 x i32> - ret <8 x i32> %1 -} - -define <4 x i32> @fptouiv4f32v4i32(<4 x float> %a) { - ; AVX512F-LABEL: fptouiv4f32v4i32 - ; AVX512F: cost of 1 {{.*}} fptoui - %1 = fptoui <4 x float> %a to <4 x i32> - ret <4 x i32> %1 -} - -define <2 x i32> @fptouiv2f32v2i32(<2 x float> %a) { - ; 
AVX512F-LABEL: fptouiv2f32v2i32 - ; AVX512F: cost of 1 {{.*}} fptoui - %1 = fptoui <2 x float> %a to <2 x i32> - ret <2 x i32> %1 -} - -define <16 x i32> @fptouiv16f32v16i32(<16 x float> %a) { - ; AVX512F-LABEL: fptouiv16f32v16i32 - ; AVX512F: cost of 1 {{.*}} fptoui - %1 = fptoui <16 x float> %a to <16 x i32> - ret <16 x i32> %1 -} - -define <8 x i64> @fptouiv8f32v8i64(<8 x float> %a) { - ; AVX512DQ-LABEL: fptouiv8f32v8i64 - ; AVX512DQ: cost of 1 {{.*}} fptoui - %1 = fptoui <8 x float> %a to <8 x i64> - ret <8 x i64> %1 -} - -define <4 x i64> @fptouiv4f32v4i64(<4 x float> %a) { - ; AVX512DQ-LABEL: fptouiv4f32v4i64 - ; AVX512DQ: cost of 1 {{.*}} fptoui - %1 = fptoui <4 x float> %a to <4 x i64> - ret <4 x i64> %1 -} - -define <2 x i64> @fptouiv2f32v2i64(<2 x float> %a) { - ; AVX512DQ-LABEL: fptouiv2f32v2i64 - ; AVX512DQ: cost of 1 {{.*}} fptoui - %1 = fptoui <2 x float> %a to <2 x i64> - ret <2 x i64> %1 +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512f -cost-model -analyze < %s | FileCheck --check-prefix=AVX512 --check-prefix=AVX512F %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+avx512dq -cost-model -analyze < %s | FileCheck --check-prefix=AVX512 --check-prefix=AVX512DQ %s + +; CHECK-LABEL: 'uitofp_i8_double' +define i32 @uitofp_i8_double() { + ; SSE2: cost of 1 {{.*}} uitofp i8 + ; AVX1: cost of 1 {{.*}} uitofp i8 + ; AVX2: cost of 1 {{.*}} uitofp i8 + ; AVX512: cost of 1 {{.*}} uitofp i8 + %cvt_i8_f64 = uitofp i8 undef to double + + ; SSE2: cost of 20 {{.*}} uitofp <2 x i8> + ; AVX1: cost of 4 {{.*}} uitofp <2 x i8> + ; AVX2: cost of 4 {{.*}} uitofp <2 x i8> + ; AVX512: cost of 2 {{.*}} uitofp <2 x i8> + %cvt_v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double> + + ; SSE2: cost of 40 {{.*}} uitofp <4 x i8> + ; AVX1: cost of 2 {{.*}} uitofp <4 x i8> + ; AVX2: cost of 2 {{.*}} uitofp <4 x i8> + ; AVX512: cost of 2 {{.*}} uitofp <4 x i8> + %cvt_v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> + + ; SSE2: cost of 80 {{.*}} uitofp <8 x i8> + ; AVX1: cost of 5 {{.*}} uitofp <8 x i8> + ; AVX2: cost of 5 {{.*}} uitofp <8 x i8> + ; AVX512: cost of 2 {{.*}} uitofp <8 x i8> + %cvt_v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> + + ret i32 undef +} + +; CHECK-LABEL: 'uitofp_i16_double' +define i32 @uitofp_i16_double() { + ; SSE2: cost of 1 {{.*}} uitofp i16 + ; AVX1: cost of 1 {{.*}} uitofp i16 + ; AVX2: cost of 1 {{.*}} uitofp i16 + ; AVX512: cost of 1 {{.*}} uitofp i16 + %cvt_i16_f64 = uitofp i16 undef to double + + ; SSE2: cost of 20 {{.*}} uitofp <2 x i16> + ; AVX1: cost of 4 {{.*}} uitofp <2 x i16> + ; AVX2: cost of 4 {{.*}} uitofp <2 x i16> + ; AVX512: cost of 5 {{.*}} uitofp <2 x i16> + %cvt_v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double> + + ; SSE2: cost of 40 {{.*}} uitofp <4 x i16> + ; AVX1: cost of 2 {{.*}} uitofp <4 x i16> + ; AVX2: cost of 2 {{.*}} uitofp <4 x i16> + ; AVX512: cost of 2 {{.*}} uitofp <4 x i16> + %cvt_v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double> + + ; SSE2: cost of 80 {{.*}} uitofp <8 x i16> + ; AVX1: cost of 5 {{.*}} uitofp <8 x i16> + ; AVX2: cost of 5 {{.*}} uitofp <8 x i16> + ; AVX512: cost of 2 {{.*}} uitofp <8 x i16> + %cvt_v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double> + + ret i32 undef +} + +; CHECK-LABEL: 'uitofp_i32_double' +define i32 @uitofp_i32_double() { + ; SSE2: cost of 1 {{.*}} uitofp i32 + ; AVX1: cost of 1 {{.*}} uitofp i32 + ; AVX2: cost of 1 {{.*}} uitofp i32 + ; AVX512: cost of 1 {{.*}} uitofp i32 + %cvt_i32_f64 = uitofp i32 undef to double + + ; SSE2: cost of 20 {{.*}} uitofp <2 x i32> + ; AVX1: cost of 6 {{.*}} 
uitofp <2 x i32>
+  ; AVX2: cost of 6 {{.*}} uitofp <2 x i32>
+  ; AVX512: cost of 1 {{.*}} uitofp <2 x i32>
+  %cvt_v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+
+  ; SSE2: cost of 40 {{.*}} uitofp <4 x i32>
+  ; AVX1: cost of 6 {{.*}} uitofp <4 x i32>
+  ; AVX2: cost of 6 {{.*}} uitofp <4 x i32>
+  ; AVX512: cost of 1 {{.*}} uitofp <4 x i32>
+  %cvt_v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
+
+  ; SSE2: cost of 80 {{.*}} uitofp <8 x i32>
+  ; AVX1: cost of 13 {{.*}} uitofp <8 x i32>
+  ; AVX2: cost of 13 {{.*}} uitofp <8 x i32>
+  ; AVX512: cost of 1 {{.*}} uitofp <8 x i32>
+  %cvt_v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
+
+  ret i32 undef
+}
+
+; CHECK-LABEL: 'uitofp_i64_double'
+define i32 @uitofp_i64_double() {
+  ; SSE2: cost of 1 {{.*}} uitofp i64
+  ; AVX1: cost of 1 {{.*}} uitofp i64
+  ; AVX2: cost of 1 {{.*}} uitofp i64
+  ; AVX512: cost of 1 {{.*}} uitofp i64
+  %cvt_i64_f64 = uitofp i64 undef to double
+
+  ; SSE2: cost of 20 {{.*}} uitofp <2 x i64>
+  ; AVX1: cost of 10 {{.*}} uitofp <2 x i64>
+  ; AVX2: cost of 10 {{.*}} uitofp <2 x i64>
+  ; AVX512F: cost of 5 {{.*}} uitofp <2 x i64>
+  ; AVX512DQ: cost of 1 {{.*}} uitofp <2 x i64>
+  %cvt_v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+
+  ; SSE2: cost of 40 {{.*}} uitofp <4 x i64>
+  ; AVX1: cost of 20 {{.*}} uitofp <4 x i64>
+  ; AVX2: cost of 20 {{.*}} uitofp <4 x i64>
+  ; AVX512F: cost of 12 {{.*}} uitofp <4 x i64>
+  ; AVX512DQ: cost of 1 {{.*}} uitofp <4 x i64>
+  %cvt_v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+
+  ; SSE2: cost of 80 {{.*}} uitofp <8 x i64>
+  ; AVX1: cost of 41 {{.*}} uitofp <8 x i64>
+  ; AVX2: cost of 41 {{.*}} uitofp <8 x i64>
+  ; AVX512F: cost of 26 {{.*}} uitofp <8 x i64>
+  ; AVX512DQ: cost of 1 {{.*}} uitofp <8 x i64>
+  %cvt_v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+
+  ret i32 undef
+}
+
+; CHECK-LABEL: 'uitofp_i8_float'
+define i32 @uitofp_i8_float() {
+  ; SSE2: cost of 1 {{.*}} uitofp i8
+  ; AVX1: cost of 1 {{.*}} uitofp i8
+  ; AVX2: cost of 1 {{.*}} uitofp i8
+  ; AVX512: cost of 1 {{.*}} uitofp i8
+  %cvt_i8_f32 = uitofp i8 undef to float
+
+  ; SSE2: cost of 8 {{.*}} uitofp <4 x i8>
+  ; AVX1: cost of 2 {{.*}} uitofp <4 x i8>
+  ; AVX2: cost of 2 {{.*}} uitofp <4 x i8>
+  ; AVX512: cost of 2 {{.*}} uitofp <4 x i8>
+  %cvt_v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
+
+  ; SSE2: cost of 15 {{.*}} uitofp <8 x i8>
+  ; AVX1: cost of 5 {{.*}} uitofp <8 x i8>
+  ; AVX2: cost of 5 {{.*}} uitofp <8 x i8>
+  ; AVX512: cost of 2 {{.*}} uitofp <8 x i8>
+  %cvt_v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
+
+  ; SSE2: cost of 8 {{.*}} uitofp <16 x i8>
+  ; AVX1: cost of 11 {{.*}} uitofp <16 x i8>
+  ; AVX2: cost of 11 {{.*}} uitofp <16 x i8>
+  ; AVX512: cost of 2 {{.*}} uitofp <16 x i8>
+  %cvt_v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
+
+  ret i32 undef
+}
+
+; CHECK-LABEL: 'uitofp_i16_float'
+define i32 @uitofp_i16_float() {
+  ; SSE2: cost of 1 {{.*}} uitofp i16
+  ; AVX1: cost of 1 {{.*}} uitofp i16
+  ; AVX2: cost of 1 {{.*}} uitofp i16
+  ; AVX512: cost of 1 {{.*}} uitofp i16
+  %cvt_i16_f32 = uitofp i16 undef to float
+
+  ; SSE2: cost of 8 {{.*}} uitofp <4 x i16>
+  ; AVX1: cost of 2 {{.*}} uitofp <4 x i16>
+  ; AVX2: cost of 2 {{.*}} uitofp <4 x i16>
+  ; AVX512: cost of 2 {{.*}} uitofp <4 x i16>
+  %cvt_v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
+
+  ; SSE2: cost of 15 {{.*}} uitofp <8 x i16>
+  ; AVX1: cost of 5 {{.*}} uitofp <8 x i16>
+  ; AVX2: cost of 5 {{.*}} uitofp <8 x i16>
+  ; AVX512: cost of 2 {{.*}} uitofp <8 x i16>
+  %cvt_v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
+
+  ; SSE2: cost of 30 {{.*}} uitofp <16 x i16>
+  ; AVX1: cost of 11 {{.*}} uitofp <16 x i16>
+  ; AVX2: cost of 11 {{.*}} uitofp <16 x i16>
+  ; AVX512: cost of 2 {{.*}} uitofp <16 x i16>
+  %cvt_v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
+
+  ret i32 undef
+}
+
+; CHECK-LABEL: 'uitofp_i32_float'
+define i32 @uitofp_i32_float() {
+  ; SSE2: cost of 1 {{.*}} uitofp i32
+  ; AVX1: cost of 1 {{.*}} uitofp i32
+  ; AVX2: cost of 1 {{.*}} uitofp i32
+  ; AVX512: cost of 1 {{.*}} uitofp i32
+  %cvt_i32_f32 = uitofp i32 undef to float
+
+  ; SSE2: cost of 8 {{.*}} uitofp <4 x i32>
+  ; AVX1: cost of 6 {{.*}} uitofp <4 x i32>
+  ; AVX2: cost of 6 {{.*}} uitofp <4 x i32>
+  ; AVX512: cost of 1 {{.*}} uitofp <4 x i32>
+  %cvt_v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+
+  ; SSE2: cost of 16 {{.*}} uitofp <8 x i32>
+  ; AVX1: cost of 9 {{.*}} uitofp <8 x i32>
+  ; AVX2: cost of 8 {{.*}} uitofp <8 x i32>
+  ; AVX512: cost of 1 {{.*}} uitofp <8 x i32>
+  %cvt_v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+
+  ; SSE2: cost of 32 {{.*}} uitofp <16 x i32>
+  ; AVX1: cost of 19 {{.*}} uitofp <16 x i32>
+  ; AVX2: cost of 17 {{.*}} uitofp <16 x i32>
+  ; AVX512: cost of 1 {{.*}} uitofp <16 x i32>
+  %cvt_v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+
+  ret i32 undef
+}
+
+; CHECK-LABEL: 'uitofp_i64_float'
+define i32 @uitofp_i64_float() {
+  ; SSE2: cost of 1 {{.*}} uitofp i64
+  ; AVX1: cost of 1 {{.*}} uitofp i64
+  ; AVX2: cost of 1 {{.*}} uitofp i64
+  ; AVX512: cost of 1 {{.*}} uitofp i64
+  %cvt_i64_f32 = uitofp i64 undef to float
+
+  ; SSE2: cost of 15 {{.*}} uitofp <2 x i64>
+  ; AVX1: cost of 4 {{.*}} uitofp <2 x i64>
+  ; AVX2: cost of 4 {{.*}} uitofp <2 x i64>
+  ; AVX512F: cost of 5 {{.*}} uitofp <2 x i64>
+  ; AVX512DQ: cost of 1 {{.*}} uitofp <2 x i64>
+  %cvt_v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+
+  ; SSE2: cost of 30 {{.*}} uitofp <4 x i64>
+  ; AVX1: cost of 10 {{.*}} uitofp <4 x i64>
+  ; AVX2: cost of 10 {{.*}} uitofp <4 x i64>
+  ; AVX512F: cost of 10 {{.*}} uitofp <4 x i64>
+  ; AVX512DQ: cost of 1 {{.*}} uitofp <4 x i64>
+  %cvt_v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+
+  ; SSE2: cost of 60 {{.*}} uitofp <8 x i64>
+  ; AVX1: cost of 21 {{.*}} uitofp <8 x i64>
+  ; AVX2: cost of 21 {{.*}} uitofp <8 x i64>
+  ; AVX512F: cost of 26 {{.*}} uitofp <8 x i64>
+  ; AVX512DQ: cost of 1 {{.*}} uitofp <8 x i64>
+  %cvt_v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+
+  ; SSE2: cost of 120 {{.*}} uitofp <16 x i64>
+  ; AVX1: cost of 43 {{.*}} uitofp <16 x i64>
+  ; AVX2: cost of 43 {{.*}} uitofp <16 x i64>
+  ; AVX512F: cost of 53 {{.*}} uitofp <16 x i64>
+  ; AVX512DQ: cost of 3 {{.*}} uitofp <16 x i64>
+  %cvt_v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+
+  ret i32 undef
 }
diff --git a/test/Analysis/CostModel/X86/uniformshift.ll b/test/Analysis/CostModel/X86/uniformshift.ll
new file mode 100644
index 000000000000..4fef50f2bf1d
--- /dev/null
+++ b/test/Analysis/CostModel/X86/uniformshift.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=x86_64-apple-darwin -mattr=+sse2 < %s | FileCheck --check-prefix=SSE2-CODEGEN %s
+; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s
+
+define <4 x i32> @shl(<4 x i32> %vector, i32 %scalar) {
+entry:
+  ; SSE2: 'shl'
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: movd %edi, %xmm1
+  ; SSE2-CODEGEN: pslld %xmm1, %xmm0
+  %insert = insertelement <4 x i32> undef, i32 %scalar, i32 0
+  %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32>
zeroinitializer + %ret = shl <4 x i32> %vector , %splat + ret <4 x i32> %ret +} + +define <4 x i32> @ashr(<4 x i32> %vector, i32 %scalar) { +entry: + ; SSE2: 'ashr' + ; SSE2: cost of 1 {{.*}} ashr + ; SSE2-CODEGEN: movd %edi, %xmm1 + ; SSE2-CODEGEN: psrad %xmm1, %xmm0 + %insert = insertelement <4 x i32> undef, i32 %scalar, i32 0 + %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer + %ret = ashr <4 x i32> %vector , %splat + ret <4 x i32> %ret +} + +define <4 x i32> @lshr(<4 x i32> %vector, i32 %scalar) { +entry: + ; SSE2: 'lshr' + ; SSE2: cost of 1 {{.*}} lshr + ; SSE2-CODEGEN: movd %edi, %xmm1 + ; SSE2-CODEGEN: psrld %xmm1, %xmm0 + %insert = insertelement <4 x i32> undef, i32 %scalar, i32 0 + %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer + %ret = lshr <4 x i32> %vector , %splat + ret <4 x i32> %ret +} + diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll index a0d07d7b6ec0..e53e40b57e1d 100644 --- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll @@ -36,8 +36,8 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift -; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 12 for instruction: %shift +; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift @@ -48,8 +48,8 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift @@ -60,9 +60,9 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift -; AVX2: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated cost of 14 for instruction: %shift +; AVX: Found an estimated cost of 14 for instruction: %shift +; AVX2: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <8 x i16> %a, %b ret <8 x i16> %shift @@ -71,8 +71,8 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i16': ; SSE2: 
Found an estimated cost of 64 for instruction: %shift -; SSE41: Found an estimated cost of 64 for instruction: %shift -; AVX: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <16 x i16> %a, %b @@ -82,9 +82,9 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i8': ; SSE2: Found an estimated cost of 54 for instruction: %shift -; SSE41: Found an estimated cost of 54 for instruction: %shift -; AVX: Found an estimated cost of 54 for instruction: %shift -; AVX2: Found an estimated cost of 54 for instruction: %shift +; SSE41: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX2: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i8> %a, %b ret <16 x i8> %shift @@ -93,8 +93,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i8': ; SSE2: Found an estimated cost of 108 for instruction: %shift -; SSE41: Found an estimated cost of 108 for instruction: %shift -; AVX: Found an estimated cost of 108 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <32 x i8> %a, %b @@ -132,8 +132,8 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift -; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 12 for instruction: %shift +; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift @@ -145,8 +145,8 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift @@ -158,9 +158,9 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for 
function 'splatvar_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift -; AVX2: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated cost of 14 for instruction: %shift +; AVX: Found an estimated cost of 14 for instruction: %shift +; AVX2: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer %shift = ashr <8 x i16> %a, %splat @@ -170,8 +170,8 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift -; SSE41: Found an estimated cost of 64 for instruction: %shift -; AVX: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer @@ -182,9 +182,9 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i8': ; SSE2: Found an estimated cost of 54 for instruction: %shift -; SSE41: Found an estimated cost of 54 for instruction: %shift -; AVX: Found an estimated cost of 54 for instruction: %shift -; AVX2: Found an estimated cost of 54 for instruction: %shift +; SSE41: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX2: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = ashr <16 x i8> %a, %splat @@ -194,8 +194,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i8': ; SSE2: Found an estimated cost of 108 for instruction: %shift -; SSE41: Found an estimated cost of 108 for instruction: %shift -; AVX: Found an estimated cost of 108 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer @@ -232,8 +232,8 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift -; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 12 for instruction: %shift +; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 1 for 
instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift @@ -244,8 +244,8 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift @@ -256,9 +256,9 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift -; AVX2: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated cost of 14 for instruction: %shift +; AVX: Found an estimated cost of 14 for instruction: %shift +; AVX2: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> ret <8 x i16> %shift @@ -267,8 +267,8 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift -; SSE41: Found an estimated cost of 64 for instruction: %shift -; AVX: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> @@ -278,9 +278,9 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i8': ; SSE2: Found an estimated cost of 54 for instruction: %shift -; SSE41: Found an estimated cost of 54 for instruction: %shift -; AVX: Found an estimated cost of 54 for instruction: %shift -; AVX2: Found an estimated cost of 54 for instruction: %shift +; SSE41: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX2: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <16 x i8> %shift @@ -289,8 +289,8 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i8': ; SSE2: Found an estimated cost of 108 for instruction: %shift -; SSE41: Found 
an estimated cost of 108 for instruction: %shift -; AVX: Found an estimated cost of 108 for instruction: %shift +; SSE41: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = ashr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll index a686b4368f21..6d028268ea55 100644 --- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll @@ -38,8 +38,8 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift -; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 11 for instruction: %shift +; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift @@ -50,8 +50,8 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift @@ -62,9 +62,9 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift -; AVX2: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated cost of 14 for instruction: %shift +; AVX: Found an estimated cost of 14 for instruction: %shift +; AVX2: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i16> %a, %b ret <8 x i16> %shift @@ -73,8 +73,8 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift -; SSE41: Found an estimated cost of 64 for instruction: %shift -; AVX: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 28 for instruction: %shift ; 
AVX2: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <16 x i16> %a, %b @@ -84,9 +84,9 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift -; SSE41: Found an estimated cost of 26 for instruction: %shift -; AVX: Found an estimated cost of 26 for instruction: %shift -; AVX2: Found an estimated cost of 26 for instruction: %shift +; SSE41: Found an estimated cost of 12 for instruction: %shift +; AVX: Found an estimated cost of 12 for instruction: %shift +; AVX2: Found an estimated cost of 12 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i8> %a, %b ret <16 x i8> %shift @@ -95,8 +95,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift -; SSE41: Found an estimated cost of 52 for instruction: %shift -; AVX: Found an estimated cost of 52 for instruction: %shift +; SSE41: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <32 x i8> %a, %b @@ -136,8 +136,8 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift -; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 11 for instruction: %shift +; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift @@ -149,8 +149,8 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) { define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift @@ -162,9 +162,9 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift -; AVX2: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated 
cost of 14 for instruction: %shift +; AVX: Found an estimated cost of 14 for instruction: %shift +; AVX2: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer %shift = lshr <8 x i16> %a, %splat @@ -174,8 +174,8 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift -; SSE41: Found an estimated cost of 64 for instruction: %shift -; AVX: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer @@ -186,9 +186,9 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift -; SSE41: Found an estimated cost of 26 for instruction: %shift -; AVX: Found an estimated cost of 26 for instruction: %shift -; AVX2: Found an estimated cost of 26 for instruction: %shift +; SSE41: Found an estimated cost of 12 for instruction: %shift +; AVX: Found an estimated cost of 12 for instruction: %shift +; AVX2: Found an estimated cost of 12 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = lshr <16 x i8> %a, %splat @@ -198,8 +198,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift -; SSE41: Found an estimated cost of 52 for instruction: %shift -; AVX: Found an estimated cost of 52 for instruction: %shift +; SSE41: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer @@ -238,8 +238,8 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32': ; SSE2: Found an estimated cost of 16 for instruction: %shift -; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; SSE41: Found an estimated cost of 11 for instruction: %shift +; AVX: Found an estimated cost of 11 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 2 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift @@ -250,8 +250,8 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) { define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 
'constant_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift @@ -262,9 +262,9 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift -; AVX2: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated cost of 14 for instruction: %shift +; AVX: Found an estimated cost of 14 for instruction: %shift +; AVX2: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> ret <8 x i16> %shift @@ -273,8 +273,8 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift -; SSE41: Found an estimated cost of 64 for instruction: %shift -; AVX: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 4 for instruction: %shift %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> @@ -284,9 +284,9 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift -; SSE41: Found an estimated cost of 26 for instruction: %shift -; AVX: Found an estimated cost of 26 for instruction: %shift -; AVX2: Found an estimated cost of 26 for instruction: %shift +; SSE41: Found an estimated cost of 12 for instruction: %shift +; AVX: Found an estimated cost of 12 for instruction: %shift +; AVX2: Found an estimated cost of 12 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <16 x i8> %shift @@ -295,8 +295,8 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift -; SSE41: Found an estimated cost of 52 for instruction: %shift -; AVX: Found an estimated cost of 52 for instruction: %shift +; SSE41: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an 
estimated cost of 4 for instruction: %shift %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> diff --git a/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/test/Analysis/CostModel/X86/vshift-shl-cost.ll index 85ca5a5a7f32..60ba3adea42a 100644 --- a/test/Analysis/CostModel/X86/vshift-shl-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-shl-cost.ll @@ -63,9 +63,9 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift -; AVX2: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated cost of 14 for instruction: %shift +; AVX: Found an estimated cost of 14 for instruction: %shift +; AVX2: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i16> %a, %b ret <8 x i16> %shift @@ -74,8 +74,8 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift -; SSE41: Found an estimated cost of 64 for instruction: %shift -; AVX: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <16 x i16> %a, %b @@ -85,9 +85,9 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift -; SSE41: Found an estimated cost of 26 for instruction: %shift -; AVX: Found an estimated cost of 26 for instruction: %shift -; AVX2: Found an estimated cost of 26 for instruction: %shift +; SSE41: Found an estimated cost of 11 for instruction: %shift +; AVX: Found an estimated cost of 11 for instruction: %shift +; AVX2: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i8> %a, %b ret <16 x i8> %shift @@ -96,8 +96,8 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift -; SSE41: Found an estimated cost of 52 for instruction: %shift -; AVX: Found an estimated cost of 52 for instruction: %shift +; SSE41: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i8> %a, %b @@ -163,9 +163,9 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> 
%b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i16': ; SSE2: Found an estimated cost of 32 for instruction: %shift -; SSE41: Found an estimated cost of 32 for instruction: %shift -; AVX: Found an estimated cost of 32 for instruction: %shift -; AVX2: Found an estimated cost of 32 for instruction: %shift +; SSE41: Found an estimated cost of 14 for instruction: %shift +; AVX: Found an estimated cost of 14 for instruction: %shift +; AVX2: Found an estimated cost of 14 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer %shift = shl <8 x i16> %a, %splat @@ -175,8 +175,8 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift -; SSE41: Found an estimated cost of 64 for instruction: %shift -; AVX: Found an estimated cost of 64 for instruction: %shift +; SSE41: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 28 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer @@ -187,9 +187,9 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift -; SSE41: Found an estimated cost of 26 for instruction: %shift -; AVX: Found an estimated cost of 26 for instruction: %shift -; AVX2: Found an estimated cost of 26 for instruction: %shift +; SSE41: Found an estimated cost of 11 for instruction: %shift +; AVX: Found an estimated cost of 11 for instruction: %shift +; AVX2: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer %shift = shl <16 x i8> %a, %splat @@ -199,8 +199,8 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift -; SSE41: Found an estimated cost of 52 for instruction: %shift -; AVX: Found an estimated cost of 52 for instruction: %shift +; SSE41: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer @@ -286,9 +286,9 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i8': ; SSE2: Found an estimated cost of 26 for instruction: %shift -; SSE41: Found an estimated cost of 26 for instruction: %shift -; AVX: Found an estimated cost of 26 for instruction: %shift -; AVX2: Found an estimated cost of 26 for instruction: %shift +; SSE41: Found an estimated cost of 11 for instruction: %shift +; AVX: 
Found an estimated cost of 11 for instruction: %shift +; AVX2: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> ret <16 x i8> %shift @@ -297,8 +297,8 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift -; SSE41: Found an estimated cost of 52 for instruction: %shift -; AVX: Found an estimated cost of 52 for instruction: %shift +; SSE41: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 22 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; XOP: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0> |