Diffstat (limited to 'test/CodeGen')
-rw-r--r--  test/CodeGen/AArch64/aarch64-addv.ll | 63
-rw-r--r--  test/CodeGen/AArch64/aarch64-minmaxv.ll | 424
-rw-r--r--  test/CodeGen/AArch64/arm64-vabs.ll | 42
-rw-r--r--  test/CodeGen/AArch64/ldst-zero.ll | 23
-rw-r--r--  test/CodeGen/AArch64/misched-stp.ll | 35
-rw-r--r--  test/CodeGen/AMDGPU/fmax3.ll | 103
-rw-r--r--  test/CodeGen/AMDGPU/fmin3.ll | 102
-rw-r--r--  test/CodeGen/AMDGPU/global-constant.ll | 4
-rw-r--r--  test/CodeGen/AMDGPU/immv216.ll | 6
-rw-r--r--  test/CodeGen/AMDGPU/max3.ll | 91
-rw-r--r--  test/CodeGen/AMDGPU/min3.ll | 133
-rw-r--r--  test/CodeGen/AMDGPU/packed-op-sel.ll | 266
-rw-r--r--  test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll | 16
-rw-r--r--  test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll | 2
-rw-r--r--  test/CodeGen/ARM/dag-combine-ldst.ll | 2
-rw-r--r--  test/CodeGen/MSP430/vararg.ll | 1
-rw-r--r--  test/CodeGen/Mips/msa/bmzi_bmnzi.ll | 8
-rw-r--r--  test/CodeGen/PowerPC/atomic-2.ll | 14
-rw-r--r--  test/CodeGen/PowerPC/atomics-indexed.ll | 14
-rw-r--r--  test/CodeGen/PowerPC/atomics-regression.ll | 64
-rw-r--r--  test/CodeGen/PowerPC/atomics.ll | 14
-rw-r--r--  test/CodeGen/PowerPC/ppcf128sf.ll | 8
-rw-r--r--  test/CodeGen/PowerPC/save-bp.ll | 54
-rw-r--r--  test/CodeGen/PowerPC/save-cr-ppc32svr4.ll | 46
-rw-r--r--  test/CodeGen/PowerPC/save-crbp-ppc32svr4.ll | 57
-rw-r--r--  test/CodeGen/SPARC/32abi.ll | 16
-rw-r--r--  test/CodeGen/SPARC/64abi.ll | 18
-rw-r--r--  test/CodeGen/SystemZ/swift-return.ll | 6
-rw-r--r--  test/CodeGen/Thumb/stack-access.ll | 10
-rw-r--r--  test/CodeGen/Thumb2/ldr-str-imm12.ll | 4
-rw-r--r--  test/CodeGen/X86/GlobalISel/add-scalar.ll | 94
-rw-r--r--  test/CodeGen/X86/GlobalISel/legalize-add.mir | 69
-rw-r--r--  test/CodeGen/X86/GlobalISel/regbankselect-X32.mir | 36
-rw-r--r--  test/CodeGen/X86/GlobalISel/select-add-x32.mir | 63
-rw-r--r--  test/CodeGen/X86/arg-copy-elide.ll | 7
-rw-r--r--  test/CodeGen/X86/leaFixup32.mir | 508
-rw-r--r--  test/CodeGen/X86/leaFixup64.mir | 1041
-rw-r--r--  test/CodeGen/X86/nontemporal.ll | 72
-rw-r--r--  test/CodeGen/X86/psubus.ll | 1263
-rw-r--r--  test/CodeGen/X86/store-narrow.ll | 5
-rw-r--r--  test/CodeGen/X86/swift-return.ll | 6
-rw-r--r--  test/CodeGen/X86/win32-spill-xmm.ll | 2
-rw-r--r--  test/CodeGen/X86/win64_sibcall.ll | 4
-rw-r--r--  test/CodeGen/X86/win64_vararg.ll | 4
-rw-r--r--  test/CodeGen/X86/x86-64-ms_abi-vararg.ll | 4
45 files changed, 1867 insertions(+), 2957 deletions(-)
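
Every hunk below applies the same transformation: a hand-written log2 shuffle-reduction tree (repeated shufflevector plus add, or icmp/fcmp plus select, steps ending in an extractelement of lane 0) is replaced by a single call to the matching @llvm.experimental.vector.reduce.* intrinsic. A condensed before/after sketch of the idiom, paraphrased from the hunks rather than quoted verbatim:

; Before: manual log2 add reduction of <4 x i32>
  %s0 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
  %a0 = add <4 x i32> %v, %s0
  %s1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %a1 = add <4 x i32> %a0, %s1
  %r  = extractelement <4 x i32> %a1, i32 0

; After: one reduction intrinsic, declared once per element/vector type
declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
  %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %v)

Both forms are expected to select the same reduction instruction (addv here, per the CHECK lines); the intrinsic states the reduction directly instead of relying on the backend to recognize the shuffle tree.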
diff --git a/test/CodeGen/AArch64/aarch64-addv.ll b/test/CodeGen/AArch64/aarch64-addv.ll
index 91797c062b88..e65992e9913d 100644
--- a/test/CodeGen/AArch64/aarch64-addv.ll
+++ b/test/CodeGen/AArch64/aarch64-addv.ll
@@ -1,18 +1,16 @@
; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic | FileCheck %s
+; Function Attrs: nounwind readnone
+declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
+
define i8 @add_B(<16 x i8>* %arr) {
; CHECK-LABEL: add_B
; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
%bin.rdx = load <16 x i8>, <16 x i8>* %arr
- %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0
- %rdx.shuf = shufflevector <16 x i8> %bin.rdx0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef >
- %bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf
- %rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
- %bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12
- %rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
- %bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13
- %r = extractelement <16 x i8> %bin.rdx14, i32 0
+ %r = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %bin.rdx)
ret i8 %r
}
@@ -20,13 +18,7 @@ define i16 @add_H(<8 x i16>* %arr) {
; CHECK-LABEL: add_H
; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
%bin.rdx = load <8 x i16>, <8 x i16>* %arr
- %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef,i32 undef, i32 undef>
- %bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf
- %rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx13 = add <8 x i16> %bin.rdx11, %rdx.shuf12
- %rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13
- %r = extractelement <8 x i16> %bin.rdx14, i32 0
+ %r = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %bin.rdx)
ret i16 %r
}
@@ -34,11 +26,7 @@ define i32 @add_S( <4 x i32>* %arr) {
; CHECK-LABEL: add_S
; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
%bin.rdx = load <4 x i32>, <4 x i32>* %arr
- %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf
- %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12
- %r = extractelement <4 x i32> %bin.rdx13, i32 0
+ %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %bin.rdx)
ret i32 %r
}
@@ -46,12 +34,12 @@ define i64 @add_D(<2 x i64>* %arr) {
; CHECK-LABEL: add_D
; CHECK-NOT: addv
%bin.rdx = load <2 x i64>, <2 x i64>* %arr
- %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- %bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0
- %r = extractelement <2 x i64> %bin.rdx0, i32 0
+ %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %bin.rdx)
ret i64 %r
}
+declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
+
define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias nocapture readonly %arg2) {
; CHECK-LABEL: oversized_ADDV_256
; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
@@ -66,33 +54,16 @@ entry:
%7 = icmp slt <8 x i32> %6, zeroinitializer
%8 = sub nsw <8 x i32> zeroinitializer, %6
%9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
- %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx = add <8 x i32> %9, %rdx.shuf
- %rdx.shuf1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx2 = add <8 x i32> %bin.rdx, %rdx.shuf1
- %rdx.shuf3 = shufflevector <8 x i32> %bin.rdx2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx4 = add <8 x i32> %bin.rdx2, %rdx.shuf3
- %10 = extractelement <8 x i32> %bin.rdx4, i32 0
- ret i32 %10
+ %r = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %9)
+ ret i32 %r
}
+declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>)
+
define i32 @oversized_ADDV_512(<16 x i32>* %arr) {
; CHECK-LABEL: oversized_ADDV_512
; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
%bin.rdx = load <16 x i32>, <16 x i32>* %arr
-
- %rdx.shuf0 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx0 = add <16 x i32> %bin.rdx, %rdx.shuf0
-
- %rdx.shuf = shufflevector <16 x i32> %bin.rdx0, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef >
- %bin.rdx11 = add <16 x i32> %bin.rdx0, %rdx.shuf
-
- %rdx.shuf12 = shufflevector <16 x i32> %bin.rdx11, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
- %bin.rdx13 = add <16 x i32> %bin.rdx11, %rdx.shuf12
-
- %rdx.shuf13 = shufflevector <16 x i32> %bin.rdx13, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
- %bin.rdx14 = add <16 x i32> %bin.rdx13, %rdx.shuf13
-
- %r = extractelement <16 x i32> %bin.rdx14, i32 0
+ %r = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> %bin.rdx)
ret i32 %r
}
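
One detail in the file above: add_D keeps its CHECK-NOT: addv line because ADDV has no 64-bit-element (.2d) form, so a <2 x i64> add reduction cannot use it. A sketch of the lowering one would expect instead; the exact instruction sequence is an assumption, not part of this patch:

; add_D: ADDV lacks a .2d encoding, so the two i64 lanes are
; presumably combined with a pairwise add instead (assumed output):
;   ldr   q0, [x0]
;   addp  d0, v0.2d
;   fmov  x0, d0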
diff --git a/test/CodeGen/AArch64/aarch64-minmaxv.ll b/test/CodeGen/AArch64/aarch64-minmaxv.ll
index 9a56cd6ae7c0..760a8f8419f9 100644
--- a/test/CodeGen/AArch64/aarch64-minmaxv.ll
+++ b/test/CodeGen/AArch64/aarch64-minmaxv.ll
@@ -2,344 +2,148 @@
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+declare i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8>)
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16>)
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32>)
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>)
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16>)
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32>)
+
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8>)
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16>)
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32>)
+declare i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8>)
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16>)
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32>)
+
+declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float>)
+declare float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float>)
+
; CHECK-LABEL: smax_B
; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
define i8 @smax_B(<16 x i8>* nocapture readonly %arr) {
%arr.load = load <16 x i8>, <16 x i8>* %arr
- %rdx.shuf = shufflevector <16 x i8> %arr.load, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp sgt <16 x i8> %arr.load, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %arr.load, <16 x i8> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp sgt <16 x i8> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp sgt <16 x i8> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp sgt <16 x i8> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
+ %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %arr.load)
ret i8 %r
}
; CHECK-LABEL: smax_H
; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
define i16 @smax_H(<8 x i16>* nocapture readonly %arr) {
- %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
- %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp23 = icmp sgt <8 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
- %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp26 = icmp sgt <8 x i16> %rdx.minmax.select24, %rdx.shuf25
- %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
- %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp29 = icmp sgt <8 x i16> %rdx.minmax.select27, %rdx.shuf28
- %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
- %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
- %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
- %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
+ %arr.load = load <8 x i16>, <8 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %arr.load)
ret i16 %r
}
; CHECK-LABEL: smax_S
; CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
define i32 @smax_S(<4 x i32> * nocapture readonly %arr) {
- %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
- %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %rdx.minmax.cmp18 = icmp sgt <4 x i32> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
- %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp21 = icmp sgt <4 x i32> %rdx.minmax.select19, %rdx.shuf20
- %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
- %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
- %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
- %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
+ %arr.load = load <4 x i32>, <4 x i32>* %arr
+ %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %arr.load)
ret i32 %r
}
-; CHECK-LABEL: smax_D
-; CHECK-NOT: smaxv
-define i64 @smax_D(<2 x i64>* nocapture readonly %arr) {
- %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
- %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- %rdx.minmax.cmp18 = icmp sgt <2 x i64> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
- %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
- %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
- %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
- ret i64 %r
-}
-
-
; CHECK-LABEL: umax_B
; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
define i8 @umax_B(<16 x i8>* nocapture readonly %arr) {
- %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr
- %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp ugt <16 x i8> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp ugt <16 x i8> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp ugt <16 x i8> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp ugt <16 x i8> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
+ %arr.load = load <16 x i8>, <16 x i8>* %arr
+ %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %arr.load)
ret i8 %r
}
; CHECK-LABEL: umax_H
; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
define i16 @umax_H(<8 x i16>* nocapture readonly %arr) {
- %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
- %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp23 = icmp ugt <8 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
- %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp26 = icmp ugt <8 x i16> %rdx.minmax.select24, %rdx.shuf25
- %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
- %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp29 = icmp ugt <8 x i16> %rdx.minmax.select27, %rdx.shuf28
- %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
- %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
- %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
- %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
+ %arr.load = load <8 x i16>, <8 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %arr.load)
ret i16 %r
}
; CHECK-LABEL: umax_S
; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
define i32 @umax_S(<4 x i32>* nocapture readonly %arr) {
- %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
- %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %rdx.minmax.cmp18 = icmp ugt <4 x i32> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
- %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp21 = icmp ugt <4 x i32> %rdx.minmax.select19, %rdx.shuf20
- %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
- %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
- %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
- %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
+ %arr.load = load <4 x i32>, <4 x i32>* %arr
+ %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %arr.load)
ret i32 %r
}
-; CHECK-LABEL: umax_D
-; CHECK-NOT: umaxv
-define i64 @umax_D(<2 x i64>* nocapture readonly %arr) {
- %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
- %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- %rdx.minmax.cmp18 = icmp ugt <2 x i64> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
- %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
- %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
- %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
- ret i64 %r
-}
-
-
; CHECK-LABEL: smin_B
; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b
define i8 @smin_B(<16 x i8>* nocapture readonly %arr) {
- %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr
- %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp slt <16 x i8> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp slt <16 x i8> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp slt <16 x i8> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp slt <16 x i8> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
+ %arr.load = load <16 x i8>, <16 x i8>* %arr
+ %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %arr.load)
ret i8 %r
}
; CHECK-LABEL: smin_H
; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h
define i16 @smin_H(<8 x i16>* nocapture readonly %arr) {
- %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
- %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp23 = icmp slt <8 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
- %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp26 = icmp slt <8 x i16> %rdx.minmax.select24, %rdx.shuf25
- %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
- %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp29 = icmp slt <8 x i16> %rdx.minmax.select27, %rdx.shuf28
- %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
- %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
- %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
- %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
+ %arr.load = load <8 x i16>, <8 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %arr.load)
ret i16 %r
}
; CHECK-LABEL: smin_S
; CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s
define i32 @smin_S(<4 x i32>* nocapture readonly %arr) {
- %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
- %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %rdx.minmax.cmp18 = icmp slt <4 x i32> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
- %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp21 = icmp slt <4 x i32> %rdx.minmax.select19, %rdx.shuf20
- %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
- %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
- %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
- %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
+ %arr.load = load <4 x i32>, <4 x i32>* %arr
+ %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %arr.load)
ret i32 %r
}
-; CHECK-LABEL: smin_D
-; CHECK-NOT: sminv
-define i64 @smin_D(<2 x i64>* nocapture readonly %arr) {
- %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
- %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- %rdx.minmax.cmp18 = icmp slt <2 x i64> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
- %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
- %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
- %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
- ret i64 %r
-}
-
-
; CHECK-LABEL: umin_B
; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b
define i8 @umin_B(<16 x i8>* nocapture readonly %arr) {
- %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr
- %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp ult <16 x i8> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp ult <16 x i8> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp ult <16 x i8> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp ult <16 x i8> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
+ %arr.load = load <16 x i8>, <16 x i8>* %arr
+ %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %arr.load)
ret i8 %r
}
; CHECK-LABEL: umin_H
; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h
define i16 @umin_H(<8 x i16>* nocapture readonly %arr) {
- %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
- %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp23 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
- %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp26 = icmp ult <8 x i16> %rdx.minmax.select24, %rdx.shuf25
- %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
- %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp29 = icmp ult <8 x i16> %rdx.minmax.select27, %rdx.shuf28
- %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
- %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
- %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
- %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
+ %arr.load = load <8 x i16>, <8 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %arr.load)
ret i16 %r
}
; CHECK-LABEL: umin_S
; CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s
define i32 @umin_S(<4 x i32>* nocapture readonly %arr) {
- %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
- %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %rdx.minmax.cmp18 = icmp ult <4 x i32> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
- %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp21 = icmp ult <4 x i32> %rdx.minmax.select19, %rdx.shuf20
- %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
- %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
- %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
- %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
+ %arr.load = load <4 x i32>, <4 x i32>* %arr
+ %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %arr.load)
ret i32 %r
}
-; CHECK-LABEL: umin_D
-; CHECK-NOT: uminv
-define i64 @umin_D(<2 x i64>* nocapture readonly %arr) {
- %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
- %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
- %rdx.minmax.cmp18 = icmp ult <2 x i64> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
- %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
- %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
- %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
- ret i64 %r
-}
-
; CHECK-LABEL: fmaxnm_S
; CHECK: fmaxnmv
define float @fmaxnm_S(<4 x float>* nocapture readonly %arr) {
- %rdx.minmax.select = load <4 x float>, <4 x float>* %arr
- %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %rdx.minmax.cmp = fcmp fast oge <4 x float> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf
- %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp1 = fcmp fast oge <4 x float> %rdx.minmax.select1, %rdx.shuf1
- %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0
- %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0
- %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1
- %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt
+ %arr.load = load <4 x float>, <4 x float>* %arr
+ %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %arr.load)
ret float %r
}
; CHECK-LABEL: fminnm_S
; CHECK: fminnmv
define float @fminnm_S(<4 x float>* nocapture readonly %arr) {
- %rdx.minmax.select = load <4 x float>, <4 x float>* %arr
- %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %rdx.minmax.cmp = fcmp fast ole <4 x float> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf
- %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp1 = fcmp fast ole <4 x float> %rdx.minmax.select1, %rdx.shuf1
- %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0
- %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0
- %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1
- %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt
+ %arr.load = load <4 x float>, <4 x float>* %arr
+ %r = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %arr.load)
ret float %r
}
+declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>)
+
define i16 @oversized_umax_256(<16 x i16>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_umax_256
; CHECK: umax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: umaxv {{h[0-9]+}}, [[V0]]
- %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr
- %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp ugt <16 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp ugt <16 x i16> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp ugt <16 x i16> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp ugt <16 x i16> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
+ %arr.load = load <16 x i16>, <16 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> %arr.load)
ret i16 %r
}
+declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32>)
+
define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_umax_512
; CHECK: umax v
@@ -347,47 +151,23 @@ define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-NEXT: umax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-NEXT: umaxv {{s[0-9]+}}, [[V0]]
%arr.load = load <16 x i32>, <16 x i32>* %arr
- %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp ugt <16 x i32> %arr.load, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp ugt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp ugt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp ugt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+ %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> %arr.load)
ret i32 %r
}
+declare i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16>)
+
define i16 @oversized_umin_256(<16 x i16>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_umin_256
; CHECK: umin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: uminv {{h[0-9]+}}, [[V0]]
- %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr
- %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp ult <16 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp ult <16 x i16> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp ult <16 x i16> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp ult <16 x i16> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
+ %arr.load = load <16 x i16>, <16 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> %arr.load)
ret i16 %r
}
+declare i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32>)
+
define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_umin_512
; CHECK: umin v
@@ -395,47 +175,23 @@ define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-NEXT: umin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-NEXT: uminv {{s[0-9]+}}, [[V0]]
%arr.load = load <16 x i32>, <16 x i32>* %arr
- %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp ult <16 x i32> %arr.load, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp ult <16 x i32> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp ult <16 x i32> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp ult <16 x i32> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+ %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> %arr.load)
ret i32 %r
}
+declare i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16>)
+
define i16 @oversized_smax_256(<16 x i16>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_smax_256
; CHECK: smax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: smaxv {{h[0-9]+}}, [[V0]]
%arr.load = load <16 x i16>, <16 x i16>* %arr
- %rdx.shuf = shufflevector <16 x i16> %arr.load, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp sgt <16 x i16> %arr.load, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %arr.load, <16 x i16> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp sgt <16 x i16> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp sgt <16 x i16> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp sgt <16 x i16> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
+ %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> %arr.load)
ret i16 %r
}
+declare i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32>)
+
define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_smax_512
; CHECK: smax v
@@ -443,47 +199,23 @@ define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-NEXT: smax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-NEXT: smaxv {{s[0-9]+}}, [[V0]]
%arr.load = load <16 x i32>, <16 x i32>* %arr
- %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp sgt <16 x i32> %arr.load, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp sgt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp sgt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp sgt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+ %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> %arr.load)
ret i32 %r
}
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16>)
+
define i16 @oversized_smin_256(<16 x i16>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_smin_256
; CHECK: smin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
; CHECK: sminv {{h[0-9]+}}, [[V0]]
- %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr
- %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp slt <16 x i16> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp slt <16 x i16> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp slt <16 x i16> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
+ %arr.load = load <16 x i16>, <16 x i16>* %arr
+ %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> %arr.load)
ret i16 %r
}
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32>)
+
define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-LABEL: oversized_smin_512
; CHECK: smin v
@@ -491,20 +223,6 @@ define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr) {
; CHECK-NEXT: smin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-NEXT: sminv {{s[0-9]+}}, [[V0]]
%arr.load = load <16 x i32>, <16 x i32>* %arr
- %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp22 = icmp slt <16 x i32> %arr.load, %rdx.shuf
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
- %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp25 = icmp slt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
- %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
- %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp28 = icmp slt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
- %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
- %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %rdx.minmax.cmp31 = icmp slt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
- %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
- %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
- %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
- %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+ %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> %arr.load)
ret i32 %r
}
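
Also visible in the hunks above: the floating-point tests replace their fcmp fast + select chains with reduction calls tagged nnan. The flag carries over the no-NaNs guarantee that the fast fcmp provided, which is what permits the fmaxnmv/fminnmv selection the CHECK lines require. Condensed from the fmaxnm_S hunk, with the expected instruction taken from its CHECK line:

  %v = load <4 x float>, <4 x float>* %arr
  ; nnan preserves the fast-math guarantee of the removed fcmp chain
  %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %v)
  ; expected selection (per CHECK): fmaxnmv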
diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll
index c7b0c33550d0..ff7a0a8300e2 100644
--- a/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/test/CodeGen/AArch64/arm64-vabs.ll
@@ -134,8 +134,10 @@ define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
ret <2 x i64> %tmp4
}
-define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) {
-; CHECK-LABEL: uabdl8h_log2_shuffle
+declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
+
+define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) {
+; CHECK-LABEL: uabdl8h_rdx
; CHECK: uabdl2.8h
; CHECK: uabdl.8h
%aload = load <16 x i8>, <16 x i8>* %a, align 1
@@ -146,20 +148,14 @@ define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) {
%abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
%ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
%absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
- %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin1.rdx = add <16 x i16> %absel, %rdx.shuf
- %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx
- %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136
- %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138
- %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0
+ %reduced_v = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %absel)
ret i16 %reduced_v
}
-define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) {
-; CHECK-LABEL: uabdl4s_log2_shuffle
+declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
+
+define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) {
+; CHECK-LABEL: uabdl4s_rdx
; CHECK: uabdl2.4s
; CHECK: uabdl.4s
%aload = load <8 x i16>, <8 x i16>* %a, align 1
@@ -170,18 +166,14 @@ define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) {
%abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
%ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
%absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
- %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx = add <8 x i32> %absel, %rdx.shuf
- %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136
- %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138
- %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0
+ %reduced_v = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %absel)
ret i32 %reduced_v
}
-define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
-; CHECK: uabdl2d_log2_shuffle
+declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
+
+define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
+; CHECK-LABEL: uabdl2d_rdx
; CHECK: uabdl2.2d
; CHECK: uabdl.2d
%aload = load <4 x i32>, <4 x i32>* %a, align 1
@@ -192,11 +184,7 @@ define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
%abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
%ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
%absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
- %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
- %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136
- %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
- %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138
- %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0
+ %reduced_v = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> %absel)
ret i64 %reduced_v
}
diff --git a/test/CodeGen/AArch64/ldst-zero.ll b/test/CodeGen/AArch64/ldst-zero.ll
index 95b92ac70879..7d443a631f91 100644
--- a/test/CodeGen/AArch64/ldst-zero.ll
+++ b/test/CodeGen/AArch64/ldst-zero.ll
@@ -9,9 +9,9 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
; Original test case which exhibited the bug
define void @test1(%struct.tree_common* %t, i32 %code, i8* %type) {
; CHECK-LABEL: test1:
-; CHECK: stp xzr, xzr, [x0, #8]
-; CHECK: stp xzr, x2, [x0]
-; CHECK: str w1, [x0, #16]
+; CHECK-DAG: stp x2, xzr, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: str xzr, [x0]
entry:
%0 = bitcast %struct.tree_common* %t to i8*
tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 24, i32 8, i1 false)
@@ -25,10 +25,8 @@ entry:
; Store to each struct element instead of using memset
define void @test2(%struct.tree_common* %t, i32 %code, i8* %type) {
; CHECK-LABEL: test2:
-; CHECK: stp xzr, xzr, [x0]
-; CHECK: str wzr, [x0, #16]
-; CHECK: str w1, [x0, #16]
-; CHECK: str x2, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: stp xzr, x2, [x0]
entry:
%0 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 0
%1 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 1
@@ -44,9 +42,9 @@ entry:
; Vector store instead of memset
define void @test3(%struct.tree_common* %t, i32 %code, i8* %type) {
; CHECK-LABEL: test3:
-; CHECK: stp xzr, xzr, [x0, #8]
-; CHECK: stp xzr, x2, [x0]
-; CHECK: str w1, [x0, #16]
+; CHECK-DAG: stp x2, xzr, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: str xzr, [x0]
entry:
%0 = bitcast %struct.tree_common* %t to <3 x i64>*
store <3 x i64> zeroinitializer, <3 x i64>* %0, align 8
@@ -60,9 +58,8 @@ entry:
; Vector store, then store to vector elements
define void @test4(<3 x i64>* %p, i64 %x, i64 %y) {
; CHECK-LABEL: test4:
-; CHECK: stp xzr, xzr, [x0, #8]
-; CHECK: stp xzr, x2, [x0]
-; CHECK: str x1, [x0, #16]
+; CHECK-DAG: stp x2, x1, [x0, #8]
+; CHECK-DAG: str xzr, [x0]
entry:
store <3 x i64> zeroinitializer, <3 x i64>* %p, align 8
%0 = bitcast <3 x i64>* %p to i64*
diff --git a/test/CodeGen/AArch64/misched-stp.ll b/test/CodeGen/AArch64/misched-stp.ll
index 4ea481cae68e..1c9ea68834c2 100644
--- a/test/CodeGen/AArch64/misched-stp.ll
+++ b/test/CodeGen/AArch64/misched-stp.ll
@@ -1,20 +1,18 @@
; REQUIRES: asserts
-; RUN: llc < %s -mtriple=aarch64 -mcpu=cyclone -mattr=+use-aa -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mcpu=cyclone -mattr=+use-aa -enable-misched -verify-misched -o - | FileCheck %s
; Tests to check that the scheduler dependencies derived from alias analysis are
; correct when we have loads that have been split up so that they can later be
; merged into STP.
-; CHECK: ********** MI Scheduling **********
-; CHECK: test_splat:BB#0 entry
-; CHECK: SU({{[0-9]+}}): STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 3; mem:ST4[%3+8]
-; CHECK: Successors:
-; CHECK-NEXT: ord [[SU1:SU\([0-9]+\)]]
-; CHECK: SU({{[0-9]+}}): STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 2; mem:ST4[%3+4]
-; CHECK: Successors:
-; CHECK-NEXT: ord [[SU2:SU\([0-9]+\)]]
-; CHECK: [[SU1]]: STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 3; mem:ST4[%2]
-; CHECK: [[SU2]]: STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 2; mem:ST4[%1]
+; Now that overwritten stores are elided in SelectionDAG, dependencies
+; are resolved and removed before MISCHED. Check that we emit an
+; equivalent pair of stp instructions as a baseline.
+
+; CHECK-LABEL: test_splat
+; CHECK: ldr [[REG:w[0-9]+]], [x2]
+; CHECK-DAG: stp w0, [[REG]], [x2, #12]
+; CHECK-DAG: stp [[REG]], w1, [x2, #4]
define void @test_splat(i32 %x, i32 %y, i32* %p) {
entry:
%val = load i32, i32* %p, align 4
@@ -35,16 +33,11 @@ entry:
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
%struct.tree_common = type { i8*, i8*, i32 }
-; CHECK: ********** MI Scheduling **********
-; CHECK: test_zero:BB#0 entry
-; CHECK: SU({{[0-9]+}}): STRXui %XZR, %vreg{{[0-9]+}}, 2; mem:ST8[%0+16]
-; CHECK: Successors:
-; CHECK-NEXT: ord [[SU3:SU\([0-9]+\)]]
-; CHECK: SU({{[0-9]+}}): STRXui %XZR, %vreg{{[0-9]+}}, 1; mem:ST8[%0+8]
-; CHECK: Successors:
-; CHECK-NEXT: ord [[SU4:SU\([0-9]+\)]]
-; CHECK: [[SU3]]: STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 4; mem:ST4[%code1]
-; CHECK: [[SU4]]: STRXui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 1; mem:ST8[%type2]
+; CHECK-LABEL: test_zero
+; CHECK-DAG: stp x2, xzr, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: str xzr, [x0]
+
define void @test_zero(%struct.tree_common* %t, i32 %code, i8* %type) {
entry:
%0 = bitcast %struct.tree_common* %t to i8*
diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll
index a96eb5db9e2a..2e6d3f3c1e8f 100644
--- a/test/CodeGen/AMDGPU/fmax3.ll
+++ b/test/CodeGen/AMDGPU/fmax3.ll
@@ -1,39 +1,92 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.maxnum.f32(float, float) nounwind readnone
-
-; SI-LABEL: {{^}}test_fmax3_olt_0:
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; GCN-LABEL: {{^}}test_fmax3_olt_0_f32:
+; GCN: buffer_load_dword [[REGC:v[0-9]+]]
+; GCN: buffer_load_dword [[REGB:v[0-9]+]]
+; GCN: buffer_load_dword [[REGA:v[0-9]+]]
+; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define amdgpu_kernel void @test_fmax3_olt_0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
%a = load volatile float, float addrspace(1)* %aptr, align 4
%b = load volatile float, float addrspace(1)* %bptr, align 4
%c = load volatile float, float addrspace(1)* %cptr, align 4
- %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
- %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone
+ %f0 = call float @llvm.maxnum.f32(float %a, float %b)
+ %f1 = call float @llvm.maxnum.f32(float %f0, float %c)
store float %f1, float addrspace(1)* %out, align 4
ret void
}
; Commute operand of second fmax
-; SI-LABEL: {{^}}test_fmax3_olt_1:
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+; GCN-LABEL: {{^}}test_fmax3_olt_1_f32:
+; GCN: buffer_load_dword [[REGB:v[0-9]+]]
+; GCN: buffer_load_dword [[REGA:v[0-9]+]]
+; GCN: buffer_load_dword [[REGC:v[0-9]+]]
+; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_dword [[RESULT]],
+; GCN: s_endpgm
+define amdgpu_kernel void @test_fmax3_olt_1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
%a = load volatile float, float addrspace(1)* %aptr, align 4
%b = load volatile float, float addrspace(1)* %bptr, align 4
%c = load volatile float, float addrspace(1)* %cptr, align 4
- %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
- %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone
+ %f0 = call float @llvm.maxnum.f32(float %a, float %b)
+ %f1 = call float @llvm.maxnum.f32(float %c, float %f0)
store float %f1, float addrspace(1)* %out, align 4
ret void
}
+
+; GCN-LABEL: {{^}}test_fmax3_olt_0_f16:
+; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
+
+; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]],
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
+
+; VI: v_max_f16_e32
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+
+; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_short [[RESULT]],
+define amdgpu_kernel void @test_fmax3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
+ %a = load volatile half, half addrspace(1)* %aptr, align 2
+ %b = load volatile half, half addrspace(1)* %bptr, align 2
+ %c = load volatile half, half addrspace(1)* %cptr, align 2
+ %f0 = call half @llvm.maxnum.f16(half %a, half %b)
+ %f1 = call half @llvm.maxnum.f16(half %f0, half %c)
+ store half %f1, half addrspace(1)* %out, align 2
+ ret void
+}
+
+; Commute operand of second fmax
+; GCN-LABEL: {{^}}test_fmax3_olt_1_f16:
+; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
+
+; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]],
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
+
+; VI: v_max_f16_e32
+; VI: v_max_f16_e32 [[RESULT:v[0-9]+]],
+
+; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_short [[RESULT]],
+define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
+ %a = load volatile half, half addrspace(1)* %aptr, align 2
+ %b = load volatile half, half addrspace(1)* %bptr, align 2
+ %c = load volatile half, half addrspace(1)* %cptr, align 2
+ %f0 = call half @llvm.maxnum.f16(half %a, half %b)
+ %f1 = call half @llvm.maxnum.f16(half %c, half %f0)
+ store half %f1, half addrspace(1)* %out, align 2
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare half @llvm.maxnum.f16(half, half) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll
index 3183f77f090b..5fc5895c3ecb 100644
--- a/test/CodeGen/AMDGPU/fmin3.ll
+++ b/test/CodeGen/AMDGPU/fmin3.ll
@@ -1,40 +1,90 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-declare float @llvm.minnum.f32(float, float) nounwind readnone
-
-; SI-LABEL: {{^}}test_fmin3_olt_0:
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; GCN-LABEL: {{^}}test_fmin3_olt_0_f32:
+; GCN: buffer_load_dword [[REGC:v[0-9]+]]
+; GCN: buffer_load_dword [[REGB:v[0-9]+]]
+; GCN: buffer_load_dword [[REGA:v[0-9]+]]
+; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_dword [[RESULT]],
+define amdgpu_kernel void @test_fmin3_olt_0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
%a = load volatile float, float addrspace(1)* %aptr, align 4
%b = load volatile float, float addrspace(1)* %bptr, align 4
%c = load volatile float, float addrspace(1)* %cptr, align 4
- %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
- %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone
+ %f0 = call float @llvm.minnum.f32(float %a, float %b)
+ %f1 = call float @llvm.minnum.f32(float %f0, float %c)
store float %f1, float addrspace(1)* %out, align 4
ret void
}
; Commute operand of second fmin
-; SI-LABEL: {{^}}test_fmin3_olt_1:
-; SI: buffer_load_dword [[REGB:v[0-9]+]]
-; SI: buffer_load_dword [[REGA:v[0-9]+]]
-; SI: buffer_load_dword [[REGC:v[0-9]+]]
-; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; SI: buffer_store_dword [[RESULT]],
-; SI: s_endpgm
-define amdgpu_kernel void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+; GCN-LABEL: {{^}}test_fmin3_olt_1_f32:
+; GCN: buffer_load_dword [[REGB:v[0-9]+]]
+; GCN: buffer_load_dword [[REGA:v[0-9]+]]
+; GCN: buffer_load_dword [[REGC:v[0-9]+]]
+; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_dword [[RESULT]],
+define amdgpu_kernel void @test_fmin3_olt_1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 {
%a = load volatile float, float addrspace(1)* %aptr, align 4
%b = load volatile float, float addrspace(1)* %bptr, align 4
%c = load volatile float, float addrspace(1)* %cptr, align 4
- %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
- %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone
+ %f0 = call float @llvm.minnum.f32(float %a, float %b)
+ %f1 = call float @llvm.minnum.f32(float %c, float %f0)
store float %f1, float addrspace(1)* %out, align 4
ret void
}
+
+; GCN-LABEL: {{^}}test_fmin3_olt_0_f16:
+; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
+
+; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]],
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
+
+; VI: v_min_f16_e32
+; VI: v_min_f16_e32 [[RESULT:v[0-9]+]],
+
+; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_short [[RESULT]],
+define amdgpu_kernel void @test_fmin3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
+ %a = load volatile half, half addrspace(1)* %aptr, align 2
+ %b = load volatile half, half addrspace(1)* %bptr, align 2
+ %c = load volatile half, half addrspace(1)* %cptr, align 2
+ %f0 = call half @llvm.minnum.f16(half %a, half %b)
+ %f1 = call half @llvm.minnum.f16(half %f0, half %c)
+ store half %f1, half addrspace(1)* %out, align 2
+ ret void
+}
+
+; Commute operand of second fmin
+; GCN-LABEL: {{^}}test_fmin3_olt_1_f16:
+; GCN: buffer_load_ushort [[REGB:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGA:v[0-9]+]]
+; GCN: buffer_load_ushort [[REGC:v[0-9]+]]
+
+; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]],
+; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]]
+
+; VI: v_min_f16_e32
+; VI: v_min_f16_e32 [[RESULT:v[0-9]+]],
+
+; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
+; GCN: buffer_store_short [[RESULT]],
+define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
+ %a = load volatile half, half addrspace(1)* %aptr, align 2
+ %b = load volatile half, half addrspace(1)* %bptr, align 2
+ %c = load volatile half, half addrspace(1)* %cptr, align 2
+ %f0 = call half @llvm.minnum.f16(half %a, half %b)
+ %f1 = call half @llvm.minnum.f16(half %c, half %f0)
+ store half %f1, half addrspace(1)* %out, align 2
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.minnum.f32(float, float) #1
+declare half @llvm.minnum.f16(half, half) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/global-constant.ll b/test/CodeGen/AMDGPU/global-constant.ll
index 80acfcca7082..1898c8fb63ea 100644
--- a/test/CodeGen/AMDGPU/global-constant.ll
+++ b/test/CodeGen/AMDGPU/global-constant.ll
@@ -29,10 +29,10 @@
define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) {
%ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @private1, i32 0, i32 %index
%val = load float, float addrspace(2)* %ptr
- store float %val, float addrspace(1)* %out
+ store volatile float %val, float addrspace(1)* %out
%ptr2 = getelementptr [4 x float], [4 x float] addrspace(2) * @private2, i32 0, i32 %index
%val2 = load float, float addrspace(2)* %ptr2
- store float %val2, float addrspace(1)* %out
+ store volatile float %val2, float addrspace(1)* %out
ret void
}
diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll
index c15a30e3c540..96132d841997 100644
--- a/test/CodeGen/AMDGPU/immv216.ll
+++ b/test/CodeGen/AMDGPU/immv216.ll
@@ -288,9 +288,9 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace
}
; GCN-LABEL: {{^}}commute_add_literal_v2f16:
-; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x64006400
-; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]]
+; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
+; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]] op_sel_hi:[0,1]{{$}}
; GFX9: buffer_store_dword [[REG]]
; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
diff --git a/test/CodeGen/AMDGPU/max3.ll b/test/CodeGen/AMDGPU/max3.ll
index 4bb4fd46becd..46dcf8e340f4 100644
--- a/test/CodeGen/AMDGPU/max3.ll
+++ b/test/CodeGen/AMDGPU/max3.ll
@@ -1,41 +1,94 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-
-; FUNC-LABEL: @v_test_imax3_sgt_i32
-; SI: v_max3_i32
-define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_imax3_sgt_i32:
+; GCN: v_max3_i32
+define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
- %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %a = load i32, i32 addrspace(1)* %gep0
+ %b = load i32, i32 addrspace(1)* %gep1
+ %c = load i32, i32 addrspace(1)* %gep2
%icmp0 = icmp sgt i32 %a, %b
%i0 = select i1 %icmp0, i32 %a, i32 %b
%icmp1 = icmp sgt i32 %i0, %c
%i1 = select i1 %icmp1, i32 %i0, i32 %c
- store i32 %i1, i32 addrspace(1)* %out, align 4
+ store i32 %i1, i32 addrspace(1)* %out
ret void
}
-; FUNC-LABEL: @v_test_umax3_ugt_i32
-; SI: v_max3_u32
-define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_umax3_ugt_i32:
+; GCN: v_max3_u32
+define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
- %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %a = load i32, i32 addrspace(1)* %gep0
+ %b = load i32, i32 addrspace(1)* %gep1
+ %c = load i32, i32 addrspace(1)* %gep2
%icmp0 = icmp ugt i32 %a, %b
%i0 = select i1 %icmp0, i32 %a, i32 %b
%icmp1 = icmp ugt i32 %i0, %c
%i1 = select i1 %icmp1, i32 %i0, i32 %c
- store i32 %i1, i32 addrspace(1)* %out, align 4
+ store i32 %i1, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_imax3_sgt_i16:
+; SI: v_max3_i32
+
+; VI: v_max_i16
+; VI: v_max_i16
+
+; GFX9: v_max3_i16
+define amdgpu_kernel void @v_test_imax3_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+ %a = load i16, i16 addrspace(1)* %gep0
+ %b = load i16, i16 addrspace(1)* %gep1
+ %c = load i16, i16 addrspace(1)* %gep2
+ %icmp0 = icmp sgt i16 %a, %b
+ %i0 = select i1 %icmp0, i16 %a, i16 %b
+ %icmp1 = icmp sgt i16 %i0, %c
+ %i1 = select i1 %icmp1, i16 %i0, i16 %c
+ store i16 %i1, i16 addrspace(1)* %out
ret void
}
+
+; GCN-LABEL: {{^}}v_test_umax3_ugt_i16:
+; SI: v_max3_u32
+
+; VI: v_max_u16
+; VI: v_max_u16
+
+; GFX9: v_max3_u16
+define amdgpu_kernel void @v_test_umax3_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+ %a = load i16, i16 addrspace(1)* %gep0
+ %b = load i16, i16 addrspace(1)* %gep1
+ %c = load i16, i16 addrspace(1)* %gep2
+ %icmp0 = icmp ugt i16 %a, %b
+ %i0 = select i1 %icmp0, i16 %a, i16 %b
+ %icmp1 = icmp ugt i16 %i0, %c
+ %i1 = select i1 %icmp1, i16 %i0, i16 %c
+ store i16 %i1, i16 addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/min3.ll b/test/CodeGen/AMDGPU/min3.ll
index 59d5d2cdb1aa..e20fb81f2ecf 100644
--- a/test/CodeGen/AMDGPU/min3.ll
+++ b/test/CodeGen/AMDGPU/min3.ll
@@ -1,50 +1,50 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-
-; FUNC-LABEL: @v_test_imin3_slt_i32
-; SI: v_min3_i32
-define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; GCN-LABEL: {{^}}v_test_imin3_slt_i32:
+; GCN: v_min3_i32
+define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
- %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %a = load i32, i32 addrspace(1)* %gep0
+ %b = load i32, i32 addrspace(1)* %gep1
+ %c = load i32, i32 addrspace(1)* %gep2
%icmp0 = icmp slt i32 %a, %b
%i0 = select i1 %icmp0, i32 %a, i32 %b
%icmp1 = icmp slt i32 %i0, %c
%i1 = select i1 %icmp1, i32 %i0, i32 %c
- store i32 %i1, i32 addrspace(1)* %outgep, align 4
+ store i32 %i1, i32 addrspace(1)* %outgep
ret void
}
-; FUNC-LABEL: @v_test_umin3_ult_i32
-; SI: v_min3_u32
-define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_umin3_ult_i32:
+; GCN: v_min3_u32
+define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr i32, i32 addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
- %c = load i32, i32 addrspace(1)* %gep2, align 4
+ %a = load i32, i32 addrspace(1)* %gep0
+ %b = load i32, i32 addrspace(1)* %gep1
+ %c = load i32, i32 addrspace(1)* %gep2
%icmp0 = icmp ult i32 %a, %b
%i0 = select i1 %icmp0, i32 %a, i32 %b
%icmp1 = icmp ult i32 %i0, %c
%i1 = select i1 %icmp1, i32 %i0, i32 %c
- store i32 %i1, i32 addrspace(1)* %outgep, align 4
+ store i32 %i1, i32 addrspace(1)* %outgep
ret void
}
-; FUNC-LABEL: @v_test_umin_umin_umin
-; SI: v_min_i32
-; SI: v_min3_i32
-define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_umin_umin_umin:
+; GCN: v_min_i32
+; GCN: v_min3_i32
+define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = mul i32 %tid, 2
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -57,10 +57,10 @@ define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 add
%outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
- %c = load i32, i32 addrspace(1)* %gep2, align 4
- %d = load i32, i32 addrspace(1)* %gep3, align 4
+ %a = load i32, i32 addrspace(1)* %gep0
+ %b = load i32, i32 addrspace(1)* %gep1
+ %c = load i32, i32 addrspace(1)* %gep2
+ %d = load i32, i32 addrspace(1)* %gep3
%icmp0 = icmp slt i32 %a, %b
%i0 = select i1 %icmp0, i32 %a, i32 %b
@@ -71,14 +71,14 @@ define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 add
%icmp2 = icmp slt i32 %i0, %i1
%i2 = select i1 %icmp2, i32 %i0, i32 %i1
- store i32 %i2, i32 addrspace(1)* %outgep1, align 4
+ store i32 %i2, i32 addrspace(1)* %outgep1
ret void
}
-; FUNC-LABEL: @v_test_umin3_2_uses
-; SI-NOT: v_min3
-define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
- %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; GCN-LABEL: {{^}}v_test_umin3_2_uses:
+; GCN-NOT: v_min3
+define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid2 = mul i32 %tid, 2
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -91,10 +91,10 @@ define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrs
%outgep0 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%outgep1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tid2
- %a = load i32, i32 addrspace(1)* %gep0, align 4
- %b = load i32, i32 addrspace(1)* %gep1, align 4
- %c = load i32, i32 addrspace(1)* %gep2, align 4
- %d = load i32, i32 addrspace(1)* %gep3, align 4
+ %a = load i32, i32 addrspace(1)* %gep0
+ %b = load i32, i32 addrspace(1)* %gep1
+ %c = load i32, i32 addrspace(1)* %gep2
+ %d = load i32, i32 addrspace(1)* %gep3
%icmp0 = icmp slt i32 %a, %b
%i0 = select i1 %icmp0, i32 %a, i32 %b
@@ -105,7 +105,60 @@ define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrs
%icmp2 = icmp slt i32 %i0, %c
%i2 = select i1 %icmp2, i32 %i0, i32 %c
- store i32 %i2, i32 addrspace(1)* %outgep0, align 4
- store i32 %i0, i32 addrspace(1)* %outgep1, align 4
+ store i32 %i2, i32 addrspace(1)* %outgep0
+ store i32 %i0, i32 addrspace(1)* %outgep1
ret void
}
+
+; GCN-LABEL: {{^}}v_test_imin3_slt_i16:
+; SI: v_min3_i32
+
+; VI: v_min_i16
+; VI: v_min_i16
+
+; GFX9: v_min3_i16
+define amdgpu_kernel void @v_test_imin3_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+ %a = load i16, i16 addrspace(1)* %gep0
+ %b = load i16, i16 addrspace(1)* %gep1
+ %c = load i16, i16 addrspace(1)* %gep2
+ %icmp0 = icmp slt i16 %a, %b
+ %i0 = select i1 %icmp0, i16 %a, i16 %b
+ %icmp1 = icmp slt i16 %i0, %c
+ %i1 = select i1 %icmp1, i16 %i0, i16 %c
+ store i16 %i1, i16 addrspace(1)* %outgep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_umin3_ult_i16:
+; SI: v_min3_u32
+
+; VI: v_min_u16
+; VI: v_min_u16
+
+; GFX9: v_min3_u16
+define amdgpu_kernel void @v_test_umin3_ult_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 addrspace(1)* %cptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
+ %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
+ %gep2 = getelementptr i16, i16 addrspace(1)* %cptr, i32 %tid
+ %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
+ %a = load i16, i16 addrspace(1)* %gep0
+ %b = load i16, i16 addrspace(1)* %gep1
+ %c = load i16, i16 addrspace(1)* %gep2
+ %icmp0 = icmp ult i16 %a, %b
+ %i0 = select i1 %icmp0, i16 %a, i16 %b
+ %icmp1 = icmp ult i16 %i0, %c
+ %i1 = select i1 %icmp1, i16 %i0, i16 %c
+ store i16 %i1, i16 addrspace(1)* %outgep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/packed-op-sel.ll b/test/CodeGen/AMDGPU/packed-op-sel.ll
new file mode 100644
index 000000000000..6ff0c54c33d0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -0,0 +1,266 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
+
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
+ %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.broadcast)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Apply fneg to broadcasted vector
+; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
+ %scalar0.broadcast = shufflevector <2 x half> %scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
+ %neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %scalar0.broadcast
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Apply fneg before broadcast
+; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
+ %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.broadcast)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Apply fneg before and after broadcast; they should cancel out.
+; GCN-LABEL: {{^}}fma_vector_vector_neg_broadcast_neg_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
+ %neg.scalar0.broadcast = shufflevector <2 x half> %neg.scalar0.vec, <2 x half> undef, <2 x i32> zeroinitializer
+ %neg.neg.scalar0.broadcast = fsub <2 x half> <half -0.0, half -0.0>, %neg.scalar0.broadcast
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.scalar0.broadcast)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Add scalar, but negate low component
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.vec = insertelement <2 x half> undef, half %neg.scalar0, i32 0
+ %neg.scalar0.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %scalar0, i32 1
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.scalar0.scalar0)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Add scalar, but negate high component
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_neg_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[SCALAR0]] op_sel_hi:[1,1,0] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.vec = insertelement <2 x half> undef, half %scalar0, i32 0
+ %scalar0.neg.scalar0 = insertelement <2 x half> %neg.scalar0.vec, half %neg.scalar0, i32 1
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %scalar0.neg.scalar0)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; Apply fneg before broadcast with bitcast
+; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+
+; GCN-NOT: pack
+; GCN-NOT: and
+; GCN-NOT: shl
+; GCN-NOT: or
+
+; GCN: v_xor_b32_e32 [[NEG_SCALAR0:v[0-9]+]], 0x8000, [[SCALAR0]]
+; GCN-NEXT: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[NEG_SCALAR0]] op_sel_hi:[1,0]{{$}}
+define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+ %neg.scalar0 = fsub half -0.0, %scalar0
+ %neg.scalar0.bc = bitcast half %neg.scalar0 to i16
+
+ %neg.scalar0.vec = insertelement <2 x i16> undef, i16 %neg.scalar0.bc, i32 0
+ %neg.scalar0.broadcast = shufflevector <2 x i16> %neg.scalar0.vec, <2 x i16> undef, <2 x i32> zeroinitializer
+
+ %result = add <2 x i16> %vec0, %neg.scalar0.broadcast
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_scalar_lo_neg_scalar_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]
+
+; FIXME: Remove and
+; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
+; GCN: v_xor_b32_e32 [[SCALAR1]], 0x8000, [[SCALAR1]]
+; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]]{{$}}
+define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+ %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
+
+ %neg.scalar1 = fsub half -0.0, %scalar1
+ %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
+ %vec2 = insertelement <2 x half> %vec.ins0, half %neg.scalar1, i32 1
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]]
+; GCN: ds_read_b32 [[VEC1:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR0:v[0-9]+]]
+; GCN: ds_read_u16 [[SCALAR1:v[0-9]+]]
+
+; FIXME: Remove and
+; GCN: v_and_b32_e32 [[SCALAR0]], 0xffff, [[SCALAR0]]
+; GCN: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[SCALAR1]], 16, [[SCALAR0]]
+
+; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[PACKED]] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}}
+define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 {
+bb:
+ %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1
+ %arg2.gep = getelementptr inbounds half, half addrspace(3)* %arg2, i32 2
+
+ %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4
+ %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4
+
+ %scalar0 = load volatile half, half addrspace(3)* %arg2, align 2
+ %scalar1 = load volatile half, half addrspace(3)* %arg2.gep, align 2
+
+ %vec.ins0 = insertelement <2 x half> undef, half %scalar0, i32 0
+ %vec2 = insertelement <2 x half> %vec.ins0, half %scalar1, i32 1
+ %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2
+
+ %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2)
+ store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4
+ ret void
+}
+
+declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll b/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
index 8403dd991360..777eccb00b02 100644
--- a/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
+++ b/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
@@ -20,7 +20,7 @@ bb3: ; preds = %bb, %entry
bb8: ; preds = %bb3
%1 = getelementptr inbounds i8, i8* %0, i32 0
- store i8 0, i8* %1, align 1
+ store volatile i8 0, i8* %1, align 1
%2 = call i32 @ptou() nounwind
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
@@ -35,7 +35,7 @@ bb8: ; preds = %bb3
%7 = or i8 %6, 48
%8 = add i8 %6, 87
%iftmp.5.0.1 = select i1 %5, i8 %7, i8 %8
- store i8 %iftmp.5.0.1, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.1, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -49,7 +49,7 @@ bb8: ; preds = %bb3
%13 = or i8 %12, 48
%14 = add i8 %12, 87
%iftmp.5.0.2 = select i1 %11, i8 %13, i8 %14
- store i8 %iftmp.5.0.2, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.2, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -63,7 +63,7 @@ bb8: ; preds = %bb3
%19 = or i8 %18, 48
%20 = add i8 %18, 87
%iftmp.5.0.4 = select i1 %17, i8 %19, i8 %20
- store i8 %iftmp.5.0.4, i8* null, align 1
+ store volatile i8 %iftmp.5.0.4, i8* null, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -74,7 +74,7 @@ bb8: ; preds = %bb3
%22 = urem i32 %21, 10
%23 = icmp ult i32 %22, 10
%iftmp.5.0.5 = select i1 %23, i8 0, i8 %val8
- store i8 %iftmp.5.0.5, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.5, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -88,7 +88,7 @@ bb8: ; preds = %bb3
%28 = or i8 %27, 48
%29 = add i8 %27, 87
%iftmp.5.0.6 = select i1 %26, i8 %28, i8 %29
- store i8 %iftmp.5.0.6, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.6, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -102,7 +102,7 @@ bb8: ; preds = %bb3
%34 = or i8 %33, 48
%35 = add i8 %33, 87
%iftmp.5.0.7 = select i1 %32, i8 %34, i8 %35
- store i8 %iftmp.5.0.7, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.7, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -116,7 +116,7 @@ bb8: ; preds = %bb3
%40 = or i8 %39, 48
%41 = add i8 %39, 87
%iftmp.5.0.8 = select i1 %38, i8 %40, i8 %41
- store i8 %iftmp.5.0.8, i8* null, align 1
+ store volatile i8 %iftmp.5.0.8, i8* null, align 1
br label %bb46
bb46: ; preds = %bb3
diff --git a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
index 2a5af6199a34..954860219d19 100644
--- a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
+++ b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
@@ -13,7 +13,7 @@ entry:
; CHECK: sub sp, sp, #12
; CHECK: sub sp, sp, #4
; CHECK: add r0, sp, #4
-; CHECK: stm sp, {r0, r1, r2, r3}
+; CHECK: stmib sp, {r1, r2, r3}
%g = alloca i8*
%g1 = bitcast i8** %g to i8*
call void @llvm.va_start(i8* %g1)
diff --git a/test/CodeGen/ARM/dag-combine-ldst.ll b/test/CodeGen/ARM/dag-combine-ldst.ll
index c1960ee6c6e9..077754ef013d 100644
--- a/test/CodeGen/ARM/dag-combine-ldst.ll
+++ b/test/CodeGen/ARM/dag-combine-ldst.ll
@@ -8,7 +8,7 @@
; CHECK-LABEL: {{^}}main
; CHECK: mov [[TMP:r[0-9]+]], #0
; CHECK-NEXT: str [[TMP]], [sp, #4]
-; CHECK-NEXT: str [[TMP]], [sp]
+; CHECK_O0: str [[TMP]], [sp]
; CHECK_O0: ldr [[TMP:r[0-9]+]], [sp]
; CHECK_O0-NEXT: add [[TMP]], [[TMP]], #2
; CHECK_O1-NOT: ldr [[TMP:r[0-9]+]], [sp]
diff --git a/test/CodeGen/MSP430/vararg.ll b/test/CodeGen/MSP430/vararg.ll
index 6c8bceff5de9..a708b89cbd8f 100644
--- a/test/CodeGen/MSP430/vararg.ll
+++ b/test/CodeGen/MSP430/vararg.ll
@@ -25,7 +25,6 @@ define i16 @va_arg(i8* %vl) nounwind {
entry:
; CHECK-LABEL: va_arg:
%vl.addr = alloca i8*, align 2
-; CHECK: mov.w r12, 0(r1)
store i8* %vl, i8** %vl.addr, align 2
; CHECK: mov.w r12, [[REG:r[0-9]+]]
; CHECK-NEXT: add.w #2, [[REG]]
diff --git a/test/CodeGen/Mips/msa/bmzi_bmnzi.ll b/test/CodeGen/Mips/msa/bmzi_bmnzi.ll
index d1cb3c348c73..de62dcd69403 100644
--- a/test/CodeGen/Mips/msa/bmzi_bmnzi.ll
+++ b/test/CodeGen/Mips/msa/bmzi_bmnzi.ll
@@ -9,9 +9,9 @@ entry:
%0 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG1
%1 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG2
%2 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 240)
- store <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
+ store volatile <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
%3 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 15)
- store <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
+ store volatile <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
%4 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 170)
store <16 x i8> %4, <16 x i8>* @llvm_mips_bmnzi_b_RES
ret void
@@ -32,9 +32,9 @@ entry:
%0 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG1
%1 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG2
%2 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 240)
- store <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
+ store volatile <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
%3 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 15)
- store <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
+ store volatile <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
%4 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 170)
store <16 x i8> %4, <16 x i8>* @llvm_mips_bmnzi_b_RES
ret void
diff --git a/test/CodeGen/PowerPC/atomic-2.ll b/test/CodeGen/PowerPC/atomic-2.ll
index 18715ddb37c6..2039c1f57f17 100644
--- a/test/CodeGen/PowerPC/atomic-2.ll
+++ b/test/CodeGen/PowerPC/atomic-2.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=ppc64 | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
-; RUN: llc < %s -march=ppc64le -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE
-; RUN: llc < %s -march=ppc64 -mcpu=pwr7 | FileCheck %s
-; RUN: llc < %s -march=ppc64 -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-P8U
+; RUN: llc < %s -ppc-asm-full-reg-names -march=ppc64 | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-BE
+; RUN: llc < %s -ppc-asm-full-reg-names -march=ppc64le -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-LE
+; RUN: llc < %s -ppc-asm-full-reg-names -march=ppc64 -mcpu=pwr7 | FileCheck %s
+; RUN: llc < %s -ppc-asm-full-reg-names -march=ppc64 -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-P8U
define i64 @exchange_and_add(i64* %mem, i64 %val) nounwind {
; CHECK-LABEL: exchange_and_add:
@@ -108,8 +108,10 @@ entry:
; CHECK: @atomic_load
%tmp = load atomic i64, i64* %mem acquire, align 64
; CHECK-NOT: ldarx
-; CHECK: ld
-; CHECK: lwsync
+; CHECK: ld [[VAL:r[0-9]+]]
+; CHECK: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
+; CHECK: bne- [[CR]], .+4
+; CHECK: isync
ret i64 %tmp
}
diff --git a/test/CodeGen/PowerPC/atomics-indexed.ll b/test/CodeGen/PowerPC/atomics-indexed.ll
index 7a0dde034d68..cfe15f0061c4 100644
--- a/test/CodeGen/PowerPC/atomics-indexed.ll
+++ b/test/CodeGen/PowerPC/atomics-indexed.ll
@@ -10,16 +10,22 @@
define i8 @load_x_i8_seq_cst([100000 x i8]* %mem) {
; CHECK-LABEL: load_x_i8_seq_cst
; CHECK: sync
-; CHECK: lbzx
-; CHECK: lwsync
+; CHECK: lbzx [[VAL:r[0-9]+]]
+; CHECK-PPC32: lwsync
+; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
+; CHECK-PPC64: bne- [[CR]], .+4
+; CHECK-PPC64: isync
%ptr = getelementptr inbounds [100000 x i8], [100000 x i8]* %mem, i64 0, i64 90000
%val = load atomic i8, i8* %ptr seq_cst, align 1
ret i8 %val
}
define i16 @load_x_i16_acquire([100000 x i16]* %mem) {
; CHECK-LABEL: load_x_i16_acquire
-; CHECK: lhzx
-; CHECK: lwsync
+; CHECK: lhzx [[VAL:r[0-9]+]]
+; CHECK-PPC32: lwsync
+; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
+; CHECK-PPC64: bne- [[CR]], .+4
+; CHECK-PPC64: isync
%ptr = getelementptr inbounds [100000 x i16], [100000 x i16]* %mem, i64 0, i64 90000
%val = load atomic i16, i16* %ptr acquire, align 2
ret i16 %val
diff --git a/test/CodeGen/PowerPC/atomics-regression.ll b/test/CodeGen/PowerPC/atomics-regression.ll
index 9af82b625532..054d3a4146b0 100644
--- a/test/CodeGen/PowerPC/atomics-regression.ll
+++ b/test/CodeGen/PowerPC/atomics-regression.ll
@@ -23,7 +23,9 @@ define i8 @test2(i8* %ptr) {
; PPC64LE-LABEL: test2:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lbz 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i8, i8* %ptr acquire, align 1
ret i8 %val
@@ -35,7 +37,9 @@ define i8 @test3(i8* %ptr) {
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: ori 2, 2, 0
; PPC64LE-NEXT: lbz 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i8, i8* %ptr seq_cst, align 1
ret i8 %val
@@ -63,7 +67,9 @@ define i16 @test6(i16* %ptr) {
; PPC64LE-LABEL: test6:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lhz 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i16, i16* %ptr acquire, align 2
ret i16 %val
@@ -75,7 +81,9 @@ define i16 @test7(i16* %ptr) {
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: ori 2, 2, 0
; PPC64LE-NEXT: lhz 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i16, i16* %ptr seq_cst, align 2
ret i16 %val
@@ -103,7 +111,9 @@ define i32 @test10(i32* %ptr) {
; PPC64LE-LABEL: test10:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: lwz 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i32, i32* %ptr acquire, align 4
ret i32 %val
@@ -115,7 +125,9 @@ define i32 @test11(i32* %ptr) {
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: ori 2, 2, 0
; PPC64LE-NEXT: lwz 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i32, i32* %ptr seq_cst, align 4
ret i32 %val
@@ -143,7 +155,9 @@ define i64 @test14(i64* %ptr) {
; PPC64LE-LABEL: test14:
; PPC64LE: # BB#0:
; PPC64LE-NEXT: ld 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i64, i64* %ptr acquire, align 8
ret i64 %val
@@ -155,7 +169,9 @@ define i64 @test15(i64* %ptr) {
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: ori 2, 2, 0
; PPC64LE-NEXT: ld 3, 0(3)
-; PPC64LE-NEXT: lwsync
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
; PPC64LE-NEXT: blr
%val = load atomic i64, i64* %ptr seq_cst, align 8
ret i64 %val
@@ -9544,3 +9560,35 @@ define i64 @test559(i64* %ptr, i64 %val) {
%ret = atomicrmw umin i64* %ptr, i64 %val singlethread seq_cst
ret i64 %ret
}
+
+; The second load should never be scheduled before isync.
+define i32 @test_ordering0(i32* %ptr1, i32* %ptr2) {
+; PPC64LE-LABEL: test_ordering0:
+; PPC64LE: # BB#0:
+; PPC64LE-NEXT: lwz 4, 0(3)
+; PPC64LE-NEXT: cmpw 7, 4, 4
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
+; PPC64LE-NEXT: lwz 3, 0(3)
+; PPC64LE-NEXT: add 3, 4, 3
+; PPC64LE-NEXT: blr
+ %val1 = load atomic i32, i32* %ptr1 acquire, align 4
+ %val2 = load i32, i32* %ptr1
+ %add = add i32 %val1, %val2
+ ret i32 %add
+}
+
+; The store should never be scheduled before isync.

+define i32 @test_ordering1(i32* %ptr1, i32 %val1, i32* %ptr2) {
+; PPC64LE-LABEL: test_ordering1:
+; PPC64LE: # BB#0:
+; PPC64LE-NEXT: lwz 3, 0(3)
+; PPC64LE-NEXT: cmpw 7, 3, 3
+; PPC64LE-NEXT: bne- 7, .+4
+; PPC64LE-NEXT: isync
+; PPC64LE-NEXT: stw 4, 0(5)
+; PPC64LE-NEXT: blr
+ %val2 = load atomic i32, i32* %ptr1 acquire, align 4
+ store i32 %val1, i32* %ptr2
+ ret i32 %val2
+}
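
These two regression tests pin down the property the idiom depends on: no memory access that follows the acquire load in program order may be scheduled above the isync. A hedged sketch of the producer/consumer pattern being protected (global and function names are assumptions):

  @flag = global i32 0
  @data = global i32 0

  define void @producer() {
    store i32 42, i32* @data
    store atomic i32 1, i32* @flag release, align 4
    ret void
  }

  define i32 @consumer() {
    %f = load atomic i32, i32* @flag acquire, align 4
    %d = load i32, i32* @data   ; must stay below the isync
    %r = add i32 %f, %d         ; keep both loads live
    ret i32 %r
  }

If the plain load of @data were hoisted above the isync, the consumer could observe the flag already set and still read stale data.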
diff --git a/test/CodeGen/PowerPC/atomics.ll b/test/CodeGen/PowerPC/atomics.ll
index 2e1eff0f634d..61d54534f5fd 100644
--- a/test/CodeGen/PowerPC/atomics.ll
+++ b/test/CodeGen/PowerPC/atomics.ll
@@ -25,9 +25,12 @@ define i16 @load_i16_monotonic(i16* %mem) {
}
define i32 @load_i32_acquire(i32* %mem) {
; CHECK-LABEL: load_i32_acquire
-; CHECK: lwz
+; CHECK: lwz [[VAL:r[0-9]+]]
%val = load atomic i32, i32* %mem acquire, align 4
-; CHECK: lwsync
+; CHECK-PPC32: lwsync
+; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
+; CHECK-PPC64: bne- [[CR]], .+4
+; CHECK-PPC64: isync
ret i32 %val
}
define i64 @load_i64_seq_cst(i64* %mem) {
@@ -35,9 +38,12 @@ define i64 @load_i64_seq_cst(i64* %mem) {
; CHECK: sync
; PPC32: __sync_
; PPC64-NOT: __sync_
-; PPC64: ld
+; PPC64: ld [[VAL:r[0-9]+]]
%val = load atomic i64, i64* %mem seq_cst, align 8
-; CHECK: lwsync
+; CHECK-PPC32: lwsync
+; CHECK-PPC64: cmpw [[CR:cr[0-9]+]], [[VAL]], [[VAL]]
+; CHECK-PPC64: bne- [[CR]], .+4
+; CHECK-PPC64: isync
ret i64 %val
}
diff --git a/test/CodeGen/PowerPC/ppcf128sf.ll b/test/CodeGen/PowerPC/ppcf128sf.ll
index 6804b551e572..fde7d48da7c2 100644
--- a/test/CodeGen/PowerPC/ppcf128sf.ll
+++ b/test/CodeGen/PowerPC/ppcf128sf.ll
@@ -14,19 +14,19 @@ entry:
%0 = load ppc_fp128, ppc_fp128* @ld, align 16
%1 = load ppc_fp128, ppc_fp128* @ld2, align 16
%add = fadd ppc_fp128 %0, %1
- store ppc_fp128 %add, ppc_fp128* %c, align 16
+ store volatile ppc_fp128 %add, ppc_fp128* %c, align 16
%2 = load ppc_fp128, ppc_fp128* @ld, align 16
%3 = load ppc_fp128, ppc_fp128* @ld2, align 16
%sub = fsub ppc_fp128 %2, %3
- store ppc_fp128 %sub, ppc_fp128* %c, align 16
+ store volatile ppc_fp128 %sub, ppc_fp128* %c, align 16
%4 = load ppc_fp128, ppc_fp128* @ld, align 16
%5 = load ppc_fp128, ppc_fp128* @ld2, align 16
%mul = fmul ppc_fp128 %4, %5
- store ppc_fp128 %mul, ppc_fp128* %c, align 16
+ store volatile ppc_fp128 %mul, ppc_fp128* %c, align 16
%6 = load ppc_fp128, ppc_fp128* @ld, align 16
%7 = load ppc_fp128, ppc_fp128* @ld2, align 16
%div = fdiv ppc_fp128 %6, %7
- store ppc_fp128 %div, ppc_fp128* %c, align 16
+ store volatile ppc_fp128 %div, ppc_fp128* %c, align 16
ret void
; CHECK-LABEL: __gcc_qadd
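
The stores become volatile because all four results go to the same alloca %c: without volatile, dead-store elimination is free to delete every store but the last, and the libcall checks (__gcc_qadd and friends) could silently stop matching anything but the final operation. A minimal illustration of the hazard, assuming a hypothetical function name:

  define void @dse_example(i32* %c) {
    store i32 1, i32* %c   ; deleted as dead without volatile
    store i32 2, i32* %c   ; only this store would survive
    ret void
  }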
diff --git a/test/CodeGen/PowerPC/save-bp.ll b/test/CodeGen/PowerPC/save-bp.ll
new file mode 100644
index 000000000000..1c7e19a1d5cb
--- /dev/null
+++ b/test/CodeGen/PowerPC/save-bp.ll
@@ -0,0 +1,54 @@
+; RUN: llc -march=ppc64 -ppc-always-use-base-pointer < %s | FileCheck %s --check-prefix CHECK --check-prefix PPC64
+; RUN: llc -march=ppc32 -ppc-always-use-base-pointer < %s | FileCheck %s --check-prefix CHECK --check-prefix PPC32
+; RUN: llc -march=ppc32 -ppc-always-use-base-pointer -relocation-model pic < %s | FileCheck %s --check-prefix CHECK --check-prefix PPC32PIC
+
+; CHECK-LABEL: fred:
+
+; Check for saving/restoring frame pointer (X31) and base pointer (X30)
+; on ppc64:
+; PPC64: std 31, -8(1)
+; PPC64: std 30, -16(1)
+; PPC64: ld 31, -8(1)
+; PPC64: ld 30, -16(1)
+
+; Check for saving/restoring frame pointer (R31) and base pointer (R30)
+; on ppc32:
+; PPC32: stwux 1, 1, 0
+; PPC32: addic 0, 0, -4
+; PPC32: stwx 31, 0, 0
+; PPC32: addic 0, 0, -4
+; PPC32: stwx 30, 0, 0
+; The restore sequence:
+; PPC32: lwz 31, 0(1)
+; PPC32: addic 30, 0, 8
+; PPC32: lwz 0, -4(31)
+; PPC32: lwz 30, -8(31)
+; PPC32: mr 1, 31
+; PPC32: mr 31, 0
+
+; Check for saving/restoring frame pointer (R31) and base pointer (R29)
+; on ppc32/pic. This is mostly the same as without pic, except that base
+; pointer is in R29.
+; PPC32PIC: stwux 1, 1, 0
+; PPC32PIC: addic 0, 0, -4
+; PPC32PIC: stwx 31, 0, 0
+; PPC32PIC: addic 0, 0, -8
+; PPC32PIC: stwx 29, 0, 0
+; The restore sequence:
+; PPC32PIC: lwz 31, 0(1)
+; PPC32PIC: addic 29, 0, 12
+; PPC32PIC: lwz 0, -4(31)
+; PPC32PIC: lwz 29, -12(31)
+; PPC32PIC: mr 1, 31
+; PPC32PIC: mr 31, 0
+
+
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-unknown-freebsd"
+
+define i64 @fred() local_unnamed_addr #0 {
+entry:
+ ret i64 0
+}
+
+attributes #0 = { norecurse readnone nounwind sspstrong "no-frame-pointer-elim"="true" "target-cpu"="ppc" }
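
-ppc-always-use-base-pointer forces a base pointer even for this trivial leaf function so the save/restore paths can be checked in isolation. Ordinarily the PPC backend only reserves one (R30, or R29 under ppc32 PIC, where R30 already serves as the PIC base) when the frame must be realigned. A hedged sketch of a function that would likely need a base pointer without the flag (names are assumptions):

  declare void @use(i8*)

  define void @realigned() {
    %buf = alloca [32 x i8], align 128   ; over-aligned: forces stack realignment
    %p = getelementptr inbounds [32 x i8], [32 x i8]* %buf, i32 0, i32 0
    call void @use(i8* %p)
    ret void
  }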
diff --git a/test/CodeGen/PowerPC/save-cr-ppc32svr4.ll b/test/CodeGen/PowerPC/save-cr-ppc32svr4.ll
new file mode 100644
index 000000000000..9fabca186050
--- /dev/null
+++ b/test/CodeGen/PowerPC/save-cr-ppc32svr4.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=ppc32 -relocation-model pic < %s | FileCheck %s
+;
+; Make sure that the CR register is saved correctly on PPC32/SVR4.
+
+; CHECK-LABEL: fred:
+; CHECK: stwu 1, -32(1)
+; CHECK: stw 31, 28(1)
+; CHECK: mr 31, 1
+; CHECK: stw 30, 24(1)
+; CHECK: mfcr [[CR:[0-9]+]]
+; CHECK: stw [[CR]], 20(31)
+
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-unknown-freebsd"
+
+; Function Attrs: norecurse nounwind readnone sspstrong
+define i64 @fred(double %a0) local_unnamed_addr #0 {
+b1:
+ %v2 = fcmp olt double %a0, 0x43E0000000000000
+ br i1 %v2, label %b3, label %b7
+
+b3: ; preds = %b1
+ %v4 = fcmp olt double %a0, 0xC3E0000000000000
+ %v5 = fptosi double %a0 to i64
+ %v6 = select i1 %v4, i64 -9223372036854775808, i64 %v5
+ br label %b14
+
+b7: ; preds = %b1
+ %v8 = fcmp olt double %a0, 0x43F0000000000000
+ br i1 %v8, label %b9, label %b11
+
+b9: ; preds = %b7
+ %v10 = fptoui double %a0 to i64
+ br label %b14
+
+b11: ; preds = %b7
+ %v12 = fcmp ogt double %a0, 0.000000e+00
+ %v13 = sext i1 %v12 to i64
+ br label %b14
+
+b14: ; preds = %b11, %b9, %b3
+ %v15 = phi i64 [ %v6, %b3 ], [ %v10, %b9 ], [ %v13, %b11 ]
+ ret i64 %v15
+}
+
+attributes #0 = { norecurse nounwind readnone sspstrong "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "stack-protector-buffer-size"="8" "target-cpu"="ppc" }
diff --git a/test/CodeGen/PowerPC/save-crbp-ppc32svr4.ll b/test/CodeGen/PowerPC/save-crbp-ppc32svr4.ll
new file mode 100644
index 000000000000..b7b3c1ada965
--- /dev/null
+++ b/test/CodeGen/PowerPC/save-crbp-ppc32svr4.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=ppc32 -relocation-model pic < %s | FileCheck %s
+
+; CHECK-LABEL: fred
+; CHECK: stwux 1, 1, 0
+; Save R31..R29 via R0:
+; CHECK: addic 0, 0, -4
+; CHECK: stwx 31, 0, 0
+; CHECK: addic 0, 0, -4
+; CHECK: stwx 30, 0, 0
+; CHECK: addic 0, 0, -4
+; CHECK: stwx 29, 0, 0
+; Set R29 back to the value of R0 from before the updates:
+; CHECK: addic 29, 0, 12
+; Save CR through R12, addressing the slot via R29 (the aligned base pointer).
+; CHECK: mfcr 12
+; CHECK: stw 28, -16(29)
+; CHECK: stw 12, -20(29)
+
+target datalayout = "E-m:e-p:32:32-i64:64-n32"
+target triple = "powerpc-unknown-freebsd"
+
+; Function Attrs: norecurse readnone sspstrong
+define i64 @fred(double %a0) local_unnamed_addr #0 {
+b1:
+ %v2 = alloca i64, align 128
+ store i64 0, i64* %v2
+ %v3 = fcmp olt double %a0, 0x43E0000000000000
+ br i1 %v3, label %b4, label %b8
+
+b4: ; preds = %b1
+ %v5 = fcmp olt double %a0, 0xC3E0000000000000
+ %v6 = fptosi double %a0 to i64
+ store i64 %v6, i64* %v2
+ %v7 = select i1 %v5, i64 -9223372036854775808, i64 %v6
+ br label %b15
+
+b8: ; preds = %b1
+ %v9 = fcmp olt double %a0, 0x43F0000000000000
+ br i1 %v9, label %b10, label %b12
+
+b10: ; preds = %b8
+ %v11 = fptoui double %a0 to i64
+ br label %b15
+
+b12: ; preds = %b8
+ %v13 = fcmp ogt double %a0, 0.000000e+00
+ %v14 = sext i1 %v13 to i64
+ br label %b15
+
+b15: ; preds = %b12, %b10, %b4
+ %v16 = phi i64 [ %v7, %b4 ], [ %v11, %b10 ], [ %v14, %b12 ]
+ %v17 = load i64, i64* %v2
+ %v18 = add i64 %v17, %v16
+ ret i64 %v18
+}
+
+attributes #0 = { norecurse readnone sspstrong "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "stack-protector-buffer-size"="8" "target-cpu"="ppc" }
diff --git a/test/CodeGen/SPARC/32abi.ll b/test/CodeGen/SPARC/32abi.ll
index 09e7a3a09d86..3807f84d4e92 100644
--- a/test/CodeGen/SPARC/32abi.ll
+++ b/test/CodeGen/SPARC/32abi.ll
@@ -25,17 +25,17 @@ define void @intarg(i8 %a0, ; %i0
i32 %a5, ; %i5
i32 signext %a6, ; [%fp+92]
i8* %a7) { ; [%fp+96]
- store i8 %a0, i8* %a4
- store i8 %a1, i8* %a4
+ store volatile i8 %a0, i8* %a4
+ store volatile i8 %a1, i8* %a4
%p16 = bitcast i8* %a4 to i16*
- store i16 %a2, i16* %p16
+ store volatile i16 %a2, i16* %p16
%p32 = bitcast i8* %a4 to i32*
- store i32 %a3, i32* %p32
+ store volatile i32 %a3, i32* %p32
%pp = bitcast i8* %a4 to i8**
- store i8* %a4, i8** %pp
- store i32 %a5, i32* %p32
- store i32 %a6, i32* %p32
- store i8* %a7, i8** %pp
+ store volatile i8* %a4, i8** %pp
+ store volatile i32 %a5, i32* %p32
+ store volatile i32 %a6, i32* %p32
+ store volatile i8* %a7, i8** %pp
ret void
}
diff --git a/test/CodeGen/SPARC/64abi.ll b/test/CodeGen/SPARC/64abi.ll
index b963be2e9853..771cc409554b 100644
--- a/test/CodeGen/SPARC/64abi.ll
+++ b/test/CodeGen/SPARC/64abi.ll
@@ -24,17 +24,17 @@ define void @intarg(i8 %a0, ; %i0
i32 %a5, ; %i5
i32 signext %a6, ; [%fp+BIAS+176]
i8* %a7) { ; [%fp+BIAS+184]
- store i8 %a0, i8* %a4
- store i8 %a1, i8* %a4
+ store volatile i8 %a0, i8* %a4
+ store volatile i8 %a1, i8* %a4
%p16 = bitcast i8* %a4 to i16*
- store i16 %a2, i16* %p16
+ store volatile i16 %a2, i16* %p16
%p32 = bitcast i8* %a4 to i32*
- store i32 %a3, i32* %p32
+ store volatile i32 %a3, i32* %p32
%pp = bitcast i8* %a4 to i8**
- store i8* %a4, i8** %pp
- store i32 %a5, i32* %p32
- store i32 %a6, i32* %p32
- store i8* %a7, i8** %pp
+ store volatile i8* %a4, i8** %pp
+ store volatile i32 %a5, i32* %p32
+ store volatile i32 %a6, i32* %p32
+ store volatile i8* %a7, i8** %pp
ret void
}
@@ -316,7 +316,7 @@ define void @call_ret_i64_pair(i64* %i0) {
%rv = call { i64, i64 } @ret_i64_pair(i32 undef, i32 undef,
i64* undef, i64* undef)
%e0 = extractvalue { i64, i64 } %rv, 0
- store i64 %e0, i64* %i0
+ store volatile i64 %e0, i64* %i0
%e1 = extractvalue { i64, i64 } %rv, 1
store i64 %e1, i64* %i0
ret void
diff --git a/test/CodeGen/SystemZ/swift-return.ll b/test/CodeGen/SystemZ/swift-return.ll
index 69d0e979190c..977816f66bec 100644
--- a/test/CodeGen/SystemZ/swift-return.ll
+++ b/test/CodeGen/SystemZ/swift-return.ll
@@ -189,11 +189,11 @@ define void @consume_i1_ret() {
%v6 = extractvalue { i1, i1, i1, i1 } %call, 2
%v7 = extractvalue { i1, i1, i1, i1 } %call, 3
%val = zext i1 %v3 to i32
- store i32 %val, i32* @var
+ store volatile i32 %val, i32* @var
%val2 = zext i1 %v5 to i32
- store i32 %val2, i32* @var
+ store volatile i32 %val2, i32* @var
%val3 = zext i1 %v6 to i32
- store i32 %val3, i32* @var
+ store volatile i32 %val3, i32* @var
%val4 = zext i1 %v7 to i32
store i32 %val4, i32* @var
ret void
diff --git a/test/CodeGen/Thumb/stack-access.ll b/test/CodeGen/Thumb/stack-access.ll
index 44217aba62d5..533559a67421 100644
--- a/test/CodeGen/Thumb/stack-access.ll
+++ b/test/CodeGen/Thumb/stack-access.ll
@@ -7,13 +7,13 @@ define void @test1(i8** %p) {
%z = alloca i8, align 1
; CHECK: add r1, sp, #8
; CHECK: str r1, [r0]
- store i8* %x, i8** %p, align 4
+ store volatile i8* %x, i8** %p, align 4
; CHECK: add r1, sp, #4
; CHECK: str r1, [r0]
- store i8* %y, i8** %p, align 4
+ store volatile i8* %y, i8** %p, align 4
; CHECK: mov r1, sp
; CHECK: str r1, [r0]
- store i8* %z, i8** %p, align 4
+ store volatile i8* %z, i8** %p, align 4
ret void
}
@@ -24,10 +24,10 @@ define void @test2([1024 x i8]** %p) {
; CHECK: add r1, sp, #1020
; CHECK: adds r1, #4
; CHECK: str r1, [r0]
- store [1024 x i8]* %arr1, [1024 x i8]** %p, align 4
+ store volatile [1024 x i8]* %arr1, [1024 x i8]** %p, align 4
; CHECK: mov r1, sp
; CHECK: str r1, [r0]
- store [1024 x i8]* %arr2, [1024 x i8]** %p, align 4
+ store volatile [1024 x i8]* %arr2, [1024 x i8]** %p, align 4
ret void
}
diff --git a/test/CodeGen/Thumb2/ldr-str-imm12.ll b/test/CodeGen/Thumb2/ldr-str-imm12.ll
index 3e4bd02097ad..c6d00d4c1e11 100644
--- a/test/CodeGen/Thumb2/ldr-str-imm12.ll
+++ b/test/CodeGen/Thumb2/ldr-str-imm12.ll
@@ -50,9 +50,9 @@ bb420: ; preds = %bb20, %bb20
; CHECK: str{{(.w)?}} r{{[0-9]+}}, [sp
; CHECK: str{{(.w)?}} r{{[0-9]+}}, [sp
; CHECK: str{{(.w)?}} r{{[0-9]+}}, [sp
- store %union.rec* null, %union.rec** @zz_hold, align 4
+ store volatile %union.rec* null, %union.rec** @zz_hold, align 4
store %union.rec* null, %union.rec** @zz_res, align 4
- store %union.rec* %x, %union.rec** @zz_hold, align 4
+ store volatile %union.rec* %x, %union.rec** @zz_hold, align 4
%0 = call %union.rec* @Manifest(%union.rec* undef, %union.rec* %env, %struct.STYLE* %style, %union.rec** %bthr, %union.rec** %fthr, %union.rec** %target, %union.rec** %crs, i32 %ok, i32 %need_expand, %union.rec** %enclose, i32 %fcr) nounwind ; <%union.rec*> [#uses=0]
unreachable
diff --git a/test/CodeGen/X86/GlobalISel/add-scalar.ll b/test/CodeGen/X86/GlobalISel/add-scalar.ll
index 553bc2789ff0..85db1c0e7e7a 100644
--- a/test/CodeGen/X86/GlobalISel/add-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/add-scalar.ll
@@ -1,44 +1,94 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
-; ALL-LABEL: test_add_i64:
-; ALL: # BB#0:
-; ALL-NEXT: leaq (%rsi,%rdi), %rax
-; ALL-NEXT: retq
+; X64-LABEL: test_add_i64:
+; X64: # BB#0:
+; X64-NEXT: leaq (%rsi,%rdi), %rax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_add_i64:
+; X32: # BB#0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: .Lcfi0:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .Lcfi1:
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: .Lcfi2:
+; X32-NEXT: .cfi_def_cfa_register %ebp
+; X32-NEXT: pushl %esi
+; X32-NEXT: .Lcfi3:
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: leal 8(%ebp), %ecx
+; X32-NEXT: leal 12(%ebp), %esi
+; X32-NEXT: leal 16(%ebp), %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: leal 20(%ebp), %edx
+; X32-NEXT: movl (%edx), %edx
+; X32-NEXT: addl (%ecx), %eax
+; X32-NEXT: adcl (%esi), %edx
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
%ret = add i64 %arg1, %arg2
ret i64 %ret
}
define i32 @test_add_i32(i32 %arg1, i32 %arg2) {
-; ALL-LABEL: test_add_i32:
-; ALL: # BB#0:
-; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; ALL-NEXT: leal (%rsi,%rdi), %eax
-; ALL-NEXT: retq
+; X64-LABEL: test_add_i32:
+; X64: # BB#0:
+; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; X64-NEXT: leal (%rsi,%rdi), %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_add_i32:
+; X32: # BB#0:
+; X32-NEXT: leal 4(%esp), %ecx
+; X32-NEXT: leal 8(%esp), %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: addl (%ecx), %eax
+; X32-NEXT: retl
%ret = add i32 %arg1, %arg2
ret i32 %ret
}
define i16 @test_add_i16(i16 %arg1, i16 %arg2) {
-; ALL-LABEL: test_add_i16:
-; ALL: # BB#0:
-; ALL-NEXT: # kill: %DI<def> %DI<kill> %RDI<def>
-; ALL-NEXT: # kill: %SI<def> %SI<kill> %RSI<def>
-; ALL-NEXT: leal (%rsi,%rdi), %eax
-; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; ALL-NEXT: retq
+; X64-LABEL: test_add_i16:
+; X64: # BB#0:
+; X64-NEXT: # kill: %DI<def> %DI<kill> %RDI<def>
+; X64-NEXT: # kill: %SI<def> %SI<kill> %RSI<def>
+; X64-NEXT: leal (%rsi,%rdi), %eax
+; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: retq
+;
+; X32-LABEL: test_add_i16:
+; X32: # BB#0:
+; X32-NEXT: leal 4(%esp), %ecx
+; X32-NEXT: leal 8(%esp), %eax
+; X32-NEXT: movzwl (%eax), %eax
+; X32-NEXT: addw (%ecx), %ax
+; X32-NEXT: retl
%ret = add i16 %arg1, %arg2
ret i16 %ret
}
define i8 @test_add_i8(i8 %arg1, i8 %arg2) {
-; ALL-LABEL: test_add_i8:
-; ALL: # BB#0:
-; ALL-NEXT: addb %dil, %sil
-; ALL-NEXT: movl %esi, %eax
-; ALL-NEXT: retq
+; X64-LABEL: test_add_i8:
+; X64: # BB#0:
+; X64-NEXT: addb %dil, %sil
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_add_i8:
+; X32: # BB#0:
+; X32-NEXT: leal 4(%esp), %ecx
+; X32-NEXT: leal 8(%esp), %eax
+; X32-NEXT: movb (%eax), %al
+; X32-NEXT: addb (%ecx), %al
+; X32-NEXT: retl
%ret = add i8 %arg1, %arg2
ret i8 %ret
}
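
The X32 output shows the GlobalISel lowering of a 64-bit add on a 32-bit target: a plain add of the low halves (addl) followed by an add-with-carry of the high halves (adcl). A worked example of the carry chain (values chosen for illustration):

  ; a = 0x00000001FFFFFFFF, b = 0x0000000000000001
  ; low:  0xFFFFFFFF + 0x00000001 = 0x00000000, carry out = 1  (addl)
  ; high: 0x00000001 + 0x00000000 + 1 = 0x00000002             (adcl)
  ; result = 0x0000000200000000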
diff --git a/test/CodeGen/X86/GlobalISel/legalize-add.mir b/test/CodeGen/X86/GlobalISel/legalize-add.mir
index 22619cc71033..6a03388da947 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-add.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-add.mir
@@ -1,40 +1,67 @@
-# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
-
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
--- |
- ; ModuleID = '<stdin>'
- source_filename = "<stdin>"
- target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
- target triple = "x86_64--linux-gnu"
+ define void @test_add_i32() {
+ ret void
+ }
- define i32 @test_add_i32(i32 %arg1, i32 %arg2) {
- %ret = add i32 %arg1, %arg2
- ret i32 %ret
+ define void @test_add_i64() {
+ ret void
}
...
---
name: test_add_i32
+# ALL-LABEL: name: test_add_i32
alignment: 4
legalized: false
regBankSelected: false
-selected: false
-tracksRegLiveness: true
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
+# ALL: %0(s32) = IMPLICIT_DEF
+# ALL-NEXT: %1(s32) = IMPLICIT_DEF
+# ALL-NEXT: %2(s32) = G_ADD %0, %1
+# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
- liveins: %edi, %esi
- ; CHECK-LABEL: name: test_add_i32
- ; CHECK: [[VAL1:%.*]](s32) = COPY %edi
- ; CHECK: [[VAL2:%.*]](s32) = COPY %esi
- ; CHECK: [[RES:%.*]](s32) = G_ADD [[VAL1:%.*]], [[VAL2:%.*]]
-
- %0(s32) = COPY %edi
- %1(s32) = COPY %esi
+ %0(s32) = IMPLICIT_DEF
+ %1(s32) = IMPLICIT_DEF
%2(s32) = G_ADD %0, %1
- %eax = COPY %2(s32)
- RET 0, implicit %eax
+ RET 0
+
+...
+---
+name: test_add_i64
+# ALL-LABEL: name: test_add_i64
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+# X64: %0(s64) = IMPLICIT_DEF
+# X64-NEXT: %1(s64) = IMPLICIT_DEF
+# X64-NEXT: %2(s64) = G_ADD %0, %1
+# X64-NEXT: RET 0
+#
+# X32: %0(s64) = IMPLICIT_DEF
+# X32-NEXT: %1(s64) = IMPLICIT_DEF
+# X32-NEXT: %3(s32), %4(s32) = G_UNMERGE_VALUES %0(s64)
+# X32-NEXT: %5(s32), %6(s32) = G_UNMERGE_VALUES %1(s64)
+# X32-NEXT: %12(s8) = G_CONSTANT i8 0
+# X32-NEXT: %7(s1) = G_TRUNC %12(s8)
+# X32-NEXT: %8(s32), %9(s1) = G_UADDE %3, %5, %7
+# X32-NEXT: %10(s32), %11(s1) = G_UADDE %4, %6, %9
+# X32-NEXT: %2(s64) = G_MERGE_VALUES %8(s32), %10(s32)
+# X32-NEXT: RET 0
+body: |
+ bb.1 (%ir-block.0):
+ %0(s64) = IMPLICIT_DEF
+ %1(s64) = IMPLICIT_DEF
+ %2(s64) = G_ADD %0, %1
+ RET 0
...
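
The X32 checks above show the legalization recipe: G_UNMERGE_VALUES splits each 64-bit operand into 32-bit halves, a zero carry-in is materialized through G_CONSTANT/G_TRUNC, and two G_UADDE operations are chained through the carry bit. The semantics assumed here, as a sketch:

  # %sum(s32), %cout(s1) = G_UADDE %x, %y, %cin
  #   computes %sum = (%x + %y + %cin) mod 2^32
  #   and sets %cout to 1 exactly when the unsigned addition wraps.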
diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir b/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir
new file mode 100644
index 000000000000..a115d1fa3255
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/regbankselect-X32.mir
@@ -0,0 +1,36 @@
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=regbankselect %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -regbankselect-greedy -run-pass=regbankselect %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=GREEDY
+
+--- |
+ define void @test_uadde_i32() {
+ ret void
+ }
+
+...
+---
+name: test_uadde_i32
+# CHECK-LABEL: name: test_uadde_i32
+alignment: 4
+legalized: true
+regBankSelected: false
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: gpr }
+# CHECK-NEXT: - { id: 3, class: gpr }
+# CHECK-NEXT: - { id: 4, class: gpr }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+ - { id: 3, class: _ }
+ - { id: 4, class: _ }
+body: |
+ bb.0 (%ir-block.0):
+ %0(s32) = IMPLICIT_DEF
+ %1(s32) = IMPLICIT_DEF
+ %2(s1) = IMPLICIT_DEF
+ %3(s32), %4(s1) = G_UADDE %0, %1, %2
+ RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-add-x32.mir b/test/CodeGen/X86/GlobalISel/select-add-x32.mir
new file mode 100644
index 000000000000..8710aaa61a21
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-add-x32.mir
@@ -0,0 +1,63 @@
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=X32
+--- |
+ define i64 @test_add_i64(i64 %a, i64 %b) {
+ %r = add i64 %a, %b
+ ret i64 %r
+ }
+
+...
+---
+name: test_add_i64
+# X32-LABEL: name: test_add_i64
+alignment: 4
+legalized: true
+regBankSelected: true
+# X32: registers:
+# X32-NEXT: - { id: 0, class: gr32 }
+# X32-NEXT: - { id: 1, class: gr32 }
+# X32-NEXT: - { id: 2, class: gr32 }
+# X32-NEXT: - { id: 3, class: gr32 }
+# X32-NEXT: - { id: 4, class: gpr }
+# X32-NEXT: - { id: 5, class: gr32 }
+# X32-NEXT: - { id: 6, class: gr32 }
+# X32-NEXT: - { id: 7, class: gr32 }
+# X32-NEXT: - { id: 8, class: gr32 }
+# X32-NEXT: - { id: 9, class: gpr }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: gpr }
+ - { id: 5, class: gpr }
+ - { id: 6, class: gpr }
+ - { id: 7, class: gpr }
+ - { id: 8, class: gpr }
+ - { id: 9, class: gpr }
+# X32: %0 = IMPLICIT_DEF
+# X32-NEXT: %1 = IMPLICIT_DEF
+# X32-NEXT: %2 = IMPLICIT_DEF
+# X32-NEXT: %3 = IMPLICIT_DEF
+# X32-NEXT: %5 = ADD32rr %0, %2, implicit-def %eflags
+# X32-NEXT: %6 = COPY %eflags
+# X32-NEXT: %eflags = COPY %6
+# X32-NEXT: %7 = ADC32rr %1, %3, implicit-def %eflags, implicit %eflags
+# X32-NEXT: %8 = COPY %eflags
+# X32-NEXT: %eax = COPY %5
+# X32-NEXT: %edx = COPY %7
+# X32-NEXT: RET 0, implicit %eax, implicit %edx
+body: |
+ bb.0 (%ir-block.0):
+ %0(s32) = IMPLICIT_DEF
+ %1(s32) = IMPLICIT_DEF
+ %2(s32) = IMPLICIT_DEF
+ %3(s32) = IMPLICIT_DEF
+ %9(s8) = G_CONSTANT i8 0
+ %4(s1) = G_TRUNC %9(s8)
+ %5(s32), %6(s1) = G_UADDE %0, %2, %4
+ %7(s32), %8(s1) = G_UADDE %1, %3, %6
+ %eax = COPY %5(s32)
+ %edx = COPY %7(s32)
+ RET 0, implicit %eax, implicit %edx
+
+...
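
Instruction selection then maps the generic carry onto the physical flags register: the carry produced by ADD32rr lives in %eflags, is copied out to a virtual register, and is copied back into %eflags immediately before ADC32rr consumes it. A compressed view of the selected chain, using the register numbers from the checks above:

  # %5 = ADD32rr %0, %2, implicit-def %eflags   ; low halves, sets the carry
  # %6 = COPY %eflags                           ; capture the carry
  # %eflags = COPY %6                           ; reinstate it for the adc
  # %7 = ADC32rr %1, %3, implicit-def %eflags, implicit %eflags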
diff --git a/test/CodeGen/X86/arg-copy-elide.ll b/test/CodeGen/X86/arg-copy-elide.ll
index b9a2eeeb7f8f..126f5a1c7976 100644
--- a/test/CodeGen/X86/arg-copy-elide.ll
+++ b/test/CodeGen/X86/arg-copy-elide.ll
@@ -253,9 +253,7 @@ entry:
; CHECK: calll _addrof_i32
; CHECK: retl
-
; Don't elide the copy when the alloca is escaped with a store.
-
define void @escape_with_store(i32 %x) {
%x1 = alloca i32
%x2 = alloca i32*
@@ -268,9 +266,8 @@ define void @escape_with_store(i32 %x) {
}
; CHECK-LABEL: _escape_with_store:
-; CHECK-DAG: movl {{.*}}(%esp), %[[reg:[^ ]*]]
-; CHECK-DAG: movl $0, [[offs:[0-9]*]](%esp)
-; CHECK: movl %[[reg]], [[offs]](%esp)
+; CHECK: movl {{.*}}(%esp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], [[offs:[0-9]*]](%esp)
; CHECK: calll _addrof_i32
diff --git a/test/CodeGen/X86/leaFixup32.mir b/test/CodeGen/X86/leaFixup32.mir
deleted file mode 100644
index 70aac21c7ff2..000000000000
--- a/test/CodeGen/X86/leaFixup32.mir
+++ /dev/null
@@ -1,508 +0,0 @@
-# RUN: llc -run-pass x86-fixup-LEAs -mcpu=corei7-avx -o - %s | FileCheck %s
---- |
- ; ModuleID = 'test/CodeGen/X86/fixup-lea.ll'
- source_filename = "test/CodeGen/X86/fixup-lea.ll"
- target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
- target triple = "i386"
- ;generated using: llc -stop-after x86-pad-short-functions fixup-lea.ll > leaFinxup32.mir
-
- ;test2add_32: 3 operands LEA32r that can be replaced with 2 add instructions
- ; where ADD32ri8 is chosen
- define i32 @test2add_32() {
- ret i32 0
- }
-
- ;test2add_ebp_32: 3 operands LEA32r that can be replaced with 2 add instructions
- ; where the base is rbp/r13/ebp register
- define i32 @test2add_ebp_32() {
- ret i32 0
- }
-
- ;test1add_ebp_32: 2 operands LEA32r where base register is ebp and can be replaced
- ; with an add instruction
- define i32 @test1add_ebp_32() {
- ret i32 0
- }
-
- ;testleaadd_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions
- define i32 @testleaadd_32() {
- ret i32 0
- }
-
- ;testleaadd_ebp_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions
- ; where the base is ebp register
- define i32 @testleaadd_ebp_32() {
- ret i32 0
- }
-
- ;test1lea_ebp_32: 2 operands LEA32r wher base register is rbp/r13/ebp and can be replaced
- ; with a lea instruction
- define i32 @test1lea_ebp_32() {
- ret i32 0
- }
-
- ;test2addi32_32: 3 operands LEA32r that can be replaced with 2 add instructions where ADD32ri32
- ; is chosen
- define i32 @test2addi32_32() {
- ret i32 0
- }
-
- ;test1mov1add_ebp_32: 2 operands LEA32r that can be replaced with 1 add 1 mov instructions
- ; where the base is rbp/r13/ebp register
- define i32 @test1mov1add_ebp_32() {
- ret i32 0
- }
-
- ;testleaadd_ebp_index_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions
- ; where the base and the index are ebp register and there is offset
- define i32 @testleaadd_ebp_index_32() {
- ret i32 0
- }
-
- ;testleaadd_ebp_index2_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions
- ; where the base and the index are ebp register and there is scale
- define i32 @testleaadd_ebp_index2_32() {
- ret i32 0
- }
-
- ;test_skip_opt_32: 3 operands LEA32r that can not be replaced with 2 instructions
- define i32 @test_skip_opt_32() {
- ret i32 0
- }
-
- ;test_skip_eflags_32: LEA32r that cannot be replaced since its not safe to clobber eflags
- define i32 @test_skip_eflags_32() {
- ret i32 0
- }
-
-...
----
-name: test2add_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp
- ; CHECK: %eax = ADD32rr %eax, killed %ebp
- ; CHECK: %eax = ADD32ri8 %eax, -5
-
- %eax = LEA32r killed %eax, 1, killed %ebp, -5, _
- RETQ %eax
-
-...
----
-name: test2add_ebp_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp
- ; CHECK: %ebp = ADD32rr %ebp, killed %eax
- ; CHECK: %ebp = ADD32ri8 %ebp, -5
-
- %ebp = LEA32r killed %ebp, 1, killed %eax, -5, _
- RETQ %ebp
-
-...
----
-name: test1add_ebp_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp
- ; CHECK: %ebp = ADD32rr %ebp, killed %eax
-
- %ebp = LEA32r killed %ebp, 1, killed %eax, 0, _
- RETQ %ebp
-
-...
----
-name: testleaadd_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
- - { reg: '%ebx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %esi
- ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0
- ; CHECK: %ebx = ADD32ri8 %ebx, -5
-
- %ebx = LEA32r killed %eax, 1, killed %ebp, -5, _
- RETQ %ebx
-
-...
----
-name: testleaadd_ebp_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
- - { reg: '%ebx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp
- ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _
- ; CHECK: %ebx = ADD32ri8 %ebx, -5
-
- %ebx = LEA32r killed %ebp, 1, killed %eax, -5, _
- RETQ %ebx
-
-...
----
-name: test1lea_ebp_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
- - { reg: '%ebx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp
- ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _
-
- %ebx = LEA32r killed %ebp, 1, killed %eax, 0, _
- RETQ %ebx
-
-...
----
-name: test2addi32_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp
- ; CHECK: %eax = ADD32rr %eax, killed %ebp
- ; CHECK: %eax = ADD32ri %eax, 129
-
- %eax = LEA32r killed %eax, 1, killed %ebp, 129, _
- RETQ %eax
-
-...
----
-name: test1mov1add_ebp_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%eax' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = MOV32rr killed %ebp
- ; CHECK: %ebx = ADD32rr %ebx, killed %ebp
-
- %ebx = LEA32r killed %ebp, 1, killed %ebp, 0, _
- RETQ %ebx
-
-...
----
-name: testleaadd_ebp_index_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%ebx' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = LEA32r _, 1, killed %ebp, 5, _
- ; CHECK: %ebx = ADD32rr %ebx, killed %ebp
-
- %ebx = LEA32r killed %ebp, 1, killed %ebp, 5, _
- RETQ %ebx
-
-...
----
-name: testleaadd_ebp_index2_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%ebx' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = LEA32r _, 4, killed %ebp, 5, _
- ; CHECK: %ebx = ADD32rr %ebx, killed %ebp
-
- %ebx = LEA32r killed %ebp, 4, killed %ebp, 5, _
- RETQ %ebx
-
-...
----
-name: test_skip_opt_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%ebx' }
- - { reg: '%ebp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %ebx
- ; CHECK: %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _
-
- %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _
- RETQ %ebp
-
-...
----
-name: test_skip_eflags_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%ebp' }
- - { reg: '%eax' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = LEA32r killed %eax, 4, killed %eax, 5, _
- ; CHECK: %ebp = LEA32r killed %ebx, 4, killed %ebx, 0, _
- ; CHECK: %ebp = ADD32ri8 %ebp, 5
-
- CMP32rr %eax, killed %ebx, implicit-def %eflags
- %ebx = LEA32r killed %eax, 4, killed %eax, 5, _
- JE_1 %bb.1, implicit %eflags
- RETQ %ebx
- bb.1:
- liveins: %eax, %ebp, %ebx
- %ebp = LEA32r killed %ebx, 4, killed %ebx, 5, _
- RETQ %ebp
-
-...
-
-
-
diff --git a/test/CodeGen/X86/leaFixup64.mir b/test/CodeGen/X86/leaFixup64.mir
deleted file mode 100644
index 9b0058750598..000000000000
--- a/test/CodeGen/X86/leaFixup64.mir
+++ /dev/null
@@ -1,1041 +0,0 @@
-# RUN: llc -run-pass x86-fixup-LEAs -mcpu=corei7-avx -o - %s | FileCheck %s
---- |
- ; ModuleID = 'lea-2.ll'
- source_filename = "lea-2.ll"
- target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
- ;generated using: llc -stop-after x86-pad-short-functions lea-2.ll > leaFinxup64.mir
-
- ;testleaadd_64_32_1: 3 operands LEA64_32r cannot be replaced with 2 add instructions
- ; but can be replaced with 1 lea + 1 add
- define i32 @testleaadd_64_32_1() {
- ret i32 0
- }
-
- ;testleaadd_rbp_64_32_1: 3 operands LEA64_32r cannot be replaced with 2 add instructions
- ; where the base is rbp/r13/ebp register but it can be replaced with 1 lea + 1 add
- define i32 @testleaadd_rbp_64_32_1() {
- ret i32 0
- }
-
- ;test1lea_rbp_64_32_1: 2 operands LEA64_32r where base register is rbp/r13/ebp and can not
- ; be replaced with an add instruction but can be replaced with 1 lea instruction
- define i32 @test1lea_rbp_64_32_1() {
- ret i32 0
- }
-
- ;test2add_64: 3 operands LEA64r that can be replaced with 2 add instructions
- define i32 @test2add_64() {
- ret i32 0
- }
-
- ;test2add_rbp_64: 3 operands LEA64r that can be replaced with 2 add instructions
- ; where the base is rbp/r13/ebp register
- define i32 @test2add_rbp_64() {
- ret i32 0
- }
-
- ;test1add_rbp_64: 2 operands LEA64r where base register is rbp/r13/ebp and can be replaced
- ; with an add instruction
- define i32 @test1add_rbp_64() {
- ret i32 0
- }
-
- ;testleaadd_64_32: 3 operands LEA64_32r that can be replaced with 1 lea 1 add instructions
- define i32 @testleaadd_64_32() {
- ret i32 0
- }
-
- ;testleaadd_rbp_64_32: 3 operands LEA64_32r that can be replaced with 1 lea 1 add instructions
- ; where the base is rbp/r13/ebp register
- define i32 @testleaadd_rbp_64_32() {
- ret i32 0
- }
-
- ;test1lea_rbp_64_32: 2 operands LEA64_32r where base register is rbp/r13/ebp and can be replaced
- ; with a lea instruction
- define i32 @test1lea_rbp_64_32() {
- ret i32 0
- }
-
- ;testleaadd_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions
- define i32 @testleaadd_64() {
- ret i32 0
- }
-
- ;testleaadd_rbp_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions
- ; where the base is rbp/r13/ebp register
- define i32 @testleaadd_rbp_64() {
- ret i32 0
- }
-
- ;test1lea_rbp_64: 2 operands LEA64r wher base register is rbp/r13/ebp and can be replaced
- ; with a lea instruction
- define i32 @test1lea_rbp_64() {
- ret i32 0
- }
-
- ;test8: dst = base & scale!=1, can't optimize
- define i32 @test8() {
- ret i32 0
- }
-
- ;testleaaddi32_64_32: 3 operands LEA64_32r that can be replaced with 1 lea + 1 add instructions where
- ; ADD64ri32 is chosen
- define i32 @testleaaddi32_64_32() {
- ret i32 0
- }
-
- ;test1mov1add_rbp_64_32: 2 operands LEA64_32r cannot be replaced with 1 add 1 mov instructions
- ; where the base is rbp/r13/ebp register
- define i32 @test1mov1add_rbp_64_32() {
- ret i32 0
- }
-
- ;testleaadd_rbp_index_64_32: 3 operands LEA64_32r that cannot replaced with 1 lea 1 add instructions
- ; where the base and the index are ebp register and there is offset
- define i32 @testleaadd_rbp_index_64_32() {
- ret i32 0
- }
-
- ;testleaadd_rbp_index2_64_32: 3 operands LEA64_32r that cannot replaced with 1 lea 1 add instructions
- ; where the base and the index are ebp register and there is scale
- define i32 @testleaadd_rbp_index2_64_32() {
- ret i32 0
- }
-
- ;test2addi32_64: 3 operands LEA64r that can be replaced with 2 add instructions where ADD64ri32
- ; is chosen
- define i32 @test2addi32_64() {
- ret i32 0
- }
-
- ;test1mov1add_rbp_64: 2 operands LEA64r that can be replaced with 1 add 1 mov instructions
- ; where the base is rbp/r13/ebp register
- define i32 @test1mov1add_rbp_64() {
- ret i32 0
- }
-
- ;testleaadd_rbp_index_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions
- ; where the base and the index are ebp register and there is offset
- define i32 @testleaadd_rbp_index_64() {
- ret i32 0
- }
-
- ;testleaadd_rbp_index2_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions
- ; where the base and the index are ebp register and there is scale
- define i32 @testleaadd_rbp_index2_64() {
- ret i32 0
- }
-
- ;test_skip_opt_64: 3 operands LEA64r that can not be replaced with 2 instructions
- define i32 @test_skip_opt_64() {
- ret i32 0
- }
-
- ;test_skip_eflags_64: LEA64r that cannot be replaced since its not safe to clobber eflags
- define i32 @test_skip_eflags_64() {
- ret i32 0
- }
-
- ;test_skip_opt_64_32: 3 operands LEA64_32r that can not be replaced with 2 instructions
- define i32 @test_skip_opt_64_32() {
- ret i32 0
- }
-
- ;test_skip_eflags_64_32: LEA64_32r that cannot be replaced since its not safe to clobber eflags
- define i32 @test_skip_eflags_64_32() {
- ret i32 0
- }
-
-
-...
----
-name: testleaadd_64_32_1
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0
- ; CHECK: %eax = ADD32ri8 %eax, -5
-
- %eax = LEA64_32r killed %rax, 1, killed %rbp, -5, _
- RETQ %eax
-
-...
----
-name: testleaadd_rbp_64_32_1
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0
- ; CHECK: %ebp = ADD32ri8 %ebp, -5
-
- %ebp = LEA64_32r killed %rbp, 1, killed %rax, -5, _
- RETQ %ebp
-
-...
----
-name: test1lea_rbp_64_32_1
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0
-
- %ebp = LEA64_32r killed %rbp, 1, killed %rax, 0, _
- RETQ %ebp
-
-...
----
-name: test2add_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rax = ADD64rr %rax, killed %rbp
- ; CHECK: %rax = ADD64ri8 %rax, -5
-
- %rax = LEA64r killed %rax, 1, killed %rbp, -5, _
- RETQ %eax
-
-...
----
-name: test2add_rbp_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rbp = ADD64rr %rbp, killed %rax
- ; CHECK: %rbp = ADD64ri8 %rbp, -5
-
- %rbp = LEA64r killed %rbp, 1, killed %rax, -5, _
- RETQ %ebp
-
-...
----
-name: test1add_rbp_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rbp = ADD64rr %rbp, killed %rax
-
- %rbp = LEA64r killed %rbp, 1, killed %rax, 0, _
- RETQ %ebp
-
-...
----
-name: testleaadd_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
- - { reg: '%rbx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
- ; CHECK: %ebx = ADD32ri8 %ebx, -5
-
- %ebx = LEA64_32r killed %rax, 1, killed %rbp, -5, _
- RETQ %ebx
-
-...
----
-name: testleaadd_rbp_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
- - { reg: '%rbx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
- ; CHECK: %ebx = ADD32ri8 %ebx, -5
-
- %ebx = LEA64_32r killed %rbp, 1, killed %rax, -5, _
- RETQ %ebx
-
-...
----
-name: test1lea_rbp_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
- - { reg: '%rbx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
-
- %ebx = LEA64_32r killed %rbp, 1, killed %rax, 0, _
- RETQ %ebx
-
-...
----
-name: testleaadd_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
- - { reg: '%rbx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
- ; CHECK: %rbx = ADD64ri8 %rbx, -5
-
- %rbx = LEA64r killed %rax, 1, killed %rbp, -5, _
- RETQ %ebx
-
-...
----
-name: testleaadd_rbp_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
- - { reg: '%rbx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
- ; CHECK: %rbx = ADD64ri8 %rbx, -5
-
- %rbx = LEA64r killed %rbp, 1, killed %rax, -5, _
- RETQ %ebx
-
-...
----
-name: test1lea_rbp_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
- - { reg: '%rbx' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
-
- %rbx = LEA64r killed %rbp, 1, killed %rax, 0, _
- RETQ %ebx
-
-...
----
-name: test8
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rdi' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rdi, %rbp
- ; CHECK: %r12 = LEA64r _, 2, killed %r13, 5, _
- ; CHECK: %r12 = ADD64rr %r12, killed %rbp
- %rbp = KILL %rbp, implicit-def %rbp
- %r13 = KILL %rdi, implicit-def %r13
- %r12 = LEA64r killed %rbp, 2, killed %r13, 5, _
- RETQ %r12
-
-...
----
-name: testleaaddi32_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0
- ; CHECK: %eax = ADD32ri %eax, 129
-
- %eax = LEA64_32r killed %rax, 1, killed %rbp, 129, _
- RETQ %eax
-
-...
----
-name: test1mov1add_rbp_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _
-
- %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _
- RETQ %ebx
-
-...
----
-name: testleaadd_rbp_index_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbx' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _
-
- %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _
- RETQ %ebx
-
-...
----
-name: testleaadd_rbp_index2_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbx' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _
-
- %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _
- RETQ %ebx
-
-...
----
-name: test2addi32_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp
- ; CHECK: %rax = ADD64rr %rax, killed %rbp
- ; CHECK: %rax = ADD64ri32 %rax, 129
-
- %rax = LEA64r killed %rax, 1, killed %rbp, 129, _
- RETQ %eax
-
-...
----
-name: test1mov1add_rbp_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rax' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %rbx = MOV64rr killed %rbp
- ; CHECK: %rbx = ADD64rr %rbx, killed %rbp
-
- %rbx = LEA64r killed %rbp, 1, killed %rbp, 0, _
- RETQ %ebx
-
-...
----
-name: testleaadd_rbp_index_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbx' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %rbx = LEA64r _, 1, killed %rbp, 5, _
- ; CHECK: %rbx = ADD64rr %rbx, killed %rbp
-
- %rbx = LEA64r killed %rbp, 1, killed %rbp, 5, _
- RETQ %ebx
-
-...
----
-name: testleaadd_rbp_index2_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbx' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %rbx = LEA64r _, 4, killed %rbp, 5, _
- ; CHECK: %rbx = ADD64rr %rbx, killed %rbp
-
- %rbx = LEA64r killed %rbp, 4, killed %rbp, 5, _
- RETQ %ebx
-
-...
----
-name: test_skip_opt_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbx' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _
-
- %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _
- RETQ %ebp
-
-...
----
-name: test_skip_eflags_64
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbp' }
- - { reg: '%rax' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %rbx = LEA64r killed %rax, 4, killed %rax, 5, _
- ; CHECK: %rbp = LEA64r killed %rbx, 4, killed %rbx, 0, _
- ; CHECK: %rbp = ADD64ri8 %rbp, 5
-
- CMP64rr %rax, killed %rbx, implicit-def %eflags
- %rbx = LEA64r killed %rax, 4, killed %rax, 5, _
- JE_1 %bb.1, implicit %eflags
- RETQ %ebx
- bb.1:
- liveins: %rax, %rbp, %rbx
- %rbp = LEA64r killed %rbx, 4, killed %rbx, 5, _
- RETQ %ebp
-
-...
----
-name: test_skip_opt_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbx' }
- - { reg: '%rbp' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _
-
- %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _
- RETQ %ebp
-
-...
----
-name: test_skip_eflags_64_32
-alignment: 4
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-tracksRegLiveness: true
-liveins:
- - { reg: '%rbp' }
- - { reg: '%rax' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 0
- adjustsStack: false
- hasCalls: false
- maxCallFrameSize: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
-body: |
- bb.0 (%ir-block.0):
- liveins: %rax, %rbp, %rbx
- ; CHECK: %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _
- ; CHECK: %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 0, _
- ; CHECK: %ebp = ADD32ri8 %ebp, 5
-
- CMP64rr %rax, killed %rbx, implicit-def %eflags
- %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _
- JE_1 %bb.1, implicit %eflags
- RETQ %ebx
- bb.1:
- liveins: %rax, %rbp, %rbx
- %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 5, _
- RETQ %ebp
-
-...
-
-
-
diff --git a/test/CodeGen/X86/nontemporal.ll b/test/CodeGen/X86/nontemporal.ll
index 33d5caba597c..d49c88724331 100644
--- a/test/CodeGen/X86/nontemporal.ll
+++ b/test/CodeGen/X86/nontemporal.ll
@@ -9,33 +9,29 @@ define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pushl %ebp
; X32-SSE-NEXT: movl %esp, %ebp
-; X32-SSE-NEXT: pushl %esi
; X32-SSE-NEXT: andl $-16, %esp
; X32-SSE-NEXT: subl $16, %esp
; X32-SSE-NEXT: movl 72(%ebp), %eax
; X32-SSE-NEXT: movl 76(%ebp), %ecx
-; X32-SSE-NEXT: movl 12(%ebp), %edx
; X32-SSE-NEXT: movdqa 56(%ebp), %xmm3
; X32-SSE-NEXT: movdqa 40(%ebp), %xmm4
; X32-SSE-NEXT: movdqa 24(%ebp), %xmm5
-; X32-SSE-NEXT: movl 8(%ebp), %esi
-; X32-SSE-NEXT: addps .LCPI0_0, %xmm0
-; X32-SSE-NEXT: movntps %xmm0, (%esi)
-; X32-SSE-NEXT: paddq .LCPI0_1, %xmm2
-; X32-SSE-NEXT: movntdq %xmm2, (%esi)
-; X32-SSE-NEXT: addpd .LCPI0_2, %xmm1
-; X32-SSE-NEXT: movntpd %xmm1, (%esi)
-; X32-SSE-NEXT: paddd .LCPI0_3, %xmm5
-; X32-SSE-NEXT: movntdq %xmm5, (%esi)
-; X32-SSE-NEXT: paddw .LCPI0_4, %xmm4
-; X32-SSE-NEXT: movntdq %xmm4, (%esi)
-; X32-SSE-NEXT: paddb .LCPI0_5, %xmm3
-; X32-SSE-NEXT: movntdq %xmm3, (%esi)
-; X32-SSE-NEXT: movntil %edx, (%esi)
-; X32-SSE-NEXT: movntil %ecx, 4(%esi)
-; X32-SSE-NEXT: movntil %eax, (%esi)
-; X32-SSE-NEXT: leal -4(%ebp), %esp
-; X32-SSE-NEXT: popl %esi
+; X32-SSE-NEXT: movl 8(%ebp), %edx
+; X32-SSE-NEXT: addps {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: movntps %xmm0, (%edx)
+; X32-SSE-NEXT: paddq {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: movntdq %xmm2, (%edx)
+; X32-SSE-NEXT: addpd {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: movntpd %xmm1, (%edx)
+; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm5
+; X32-SSE-NEXT: movntdq %xmm5, (%edx)
+; X32-SSE-NEXT: paddw {{\.LCPI.*}}, %xmm4
+; X32-SSE-NEXT: movntdq %xmm4, (%edx)
+; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT: movntdq %xmm3, (%edx)
+; X32-SSE-NEXT: movntil %ecx, 4(%edx)
+; X32-SSE-NEXT: movntil %eax, (%edx)
+; X32-SSE-NEXT: movl %ebp, %esp
; X32-SSE-NEXT: popl %ebp
; X32-SSE-NEXT: retl
;
@@ -43,33 +39,29 @@ define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4
; X32-AVX: # BB#0:
; X32-AVX-NEXT: pushl %ebp
; X32-AVX-NEXT: movl %esp, %ebp
-; X32-AVX-NEXT: pushl %esi
; X32-AVX-NEXT: andl $-16, %esp
; X32-AVX-NEXT: subl $16, %esp
; X32-AVX-NEXT: movl 72(%ebp), %eax
; X32-AVX-NEXT: movl 76(%ebp), %ecx
-; X32-AVX-NEXT: movl 12(%ebp), %edx
; X32-AVX-NEXT: vmovdqa 56(%ebp), %xmm3
; X32-AVX-NEXT: vmovdqa 40(%ebp), %xmm4
; X32-AVX-NEXT: vmovdqa 24(%ebp), %xmm5
-; X32-AVX-NEXT: movl 8(%ebp), %esi
-; X32-AVX-NEXT: vaddps .LCPI0_0, %xmm0, %xmm0
-; X32-AVX-NEXT: vmovntps %xmm0, (%esi)
-; X32-AVX-NEXT: vpaddq .LCPI0_1, %xmm2, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT: vaddpd .LCPI0_2, %xmm1, %xmm0
-; X32-AVX-NEXT: vmovntpd %xmm0, (%esi)
-; X32-AVX-NEXT: vpaddd .LCPI0_3, %xmm5, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT: vpaddw .LCPI0_4, %xmm4, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT: vpaddb .LCPI0_5, %xmm3, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT: movntil %edx, (%esi)
-; X32-AVX-NEXT: movntil %ecx, 4(%esi)
-; X32-AVX-NEXT: movntil %eax, (%esi)
-; X32-AVX-NEXT: leal -4(%ebp), %esp
-; X32-AVX-NEXT: popl %esi
+; X32-AVX-NEXT: movl 8(%ebp), %edx
+; X32-AVX-NEXT: vaddps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-AVX-NEXT: vmovntps %xmm0, (%edx)
+; X32-AVX-NEXT: vpaddq {{\.LCPI.*}}, %xmm2, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: vaddpd {{\.LCPI.*}}, %xmm1, %xmm0
+; X32-AVX-NEXT: vmovntpd %xmm0, (%edx)
+; X32-AVX-NEXT: vpaddd {{\.LCPI.*}}, %xmm5, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: vpaddw {{\.LCPI.*}}, %xmm4, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm3, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: movntil %ecx, 4(%edx)
+; X32-AVX-NEXT: movntil %eax, (%edx)
+; X32-AVX-NEXT: movl %ebp, %esp
; X32-AVX-NEXT: popl %ebp
; X32-AVX-NEXT: retl
;
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index 35f96eda35e1..a1f1e084d330 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -1,219 +1,169 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
-define void @test1(i16* nocapture %head) nounwind {
+define <8 x i16> @test1(<8 x i16> %x) nounwind {
; SSE-LABEL: test1:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test1:
; AVX: ## BB#0: ## %vector.ph
-; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <8 x i16>*
- %2 = load <8 x i16>, <8 x i16>* %1, align 2
- %3 = icmp slt <8 x i16> %2, zeroinitializer
- %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
- %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
- store <8 x i16> %5, <8 x i16>* %1, align 2
- ret void
+ %0 = icmp slt <8 x i16> %x, zeroinitializer
+ %1 = xor <8 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
+ %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
}
-define void @test2(i16* nocapture %head) nounwind {
+define <8 x i16> @test2(<8 x i16> %x) nounwind {
; SSE-LABEL: test2:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test2:
; AVX: ## BB#0: ## %vector.ph
-; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <8 x i16>*
- %2 = load <8 x i16>, <8 x i16>* %1, align 2
- %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
- %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
- %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer
- store <8 x i16> %5, <8 x i16>* %1, align 2
- ret void
+ %0 = icmp ugt <8 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
+ %1 = add <8 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
+ %res = select <8 x i1> %0, <8 x i16> %1, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
}
-define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind {
+define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test3:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movd %esi, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE-NEXT: movdqu (%rdi), %xmm1
-; SSE-NEXT: psubusw %xmm0, %xmm1
-; SSE-NEXT: movdqu %xmm1, (%rdi)
+; SSE-NEXT: movd %edi, %xmm1
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE-NEXT: psubusw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test3:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovd %esi, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX1-NEXT: vmovdqu (%rdi), %xmm1
-; AVX1-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX1-NEXT: vmovd %edi, %xmm1
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu (%rdi), %xmm1
-; AVX2-NEXT: vpsubusw %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
vector.ph:
%0 = insertelement <8 x i16> undef, i16 %w, i32 0
%broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
- %1 = getelementptr inbounds i16, i16* %head, i64 0
- %2 = bitcast i16* %1 to <8 x i16>*
- %3 = load <8 x i16>, <8 x i16>* %2, align 2
- %4 = icmp ult <8 x i16> %3, %broadcast15
- %5 = sub <8 x i16> %3, %broadcast15
- %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5
- store <8 x i16> %6, <8 x i16>* %2, align 2
- ret void
+ %1 = icmp ult <8 x i16> %x, %broadcast15
+ %2 = sub <8 x i16> %x, %broadcast15
+ %res = select <8 x i1> %1, <8 x i16> zeroinitializer, <8 x i16> %2
+ ret <8 x i16> %res
}
-define void @test4(i8* nocapture %head) nounwind {
+define <16 x i8> @test4(<16 x i8> %x) nounwind {
; SSE-LABEL: test4:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test4:
; AVX: ## BB#0: ## %vector.ph
-; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i8, i8* %head, i64 0
- %1 = bitcast i8* %0 to <16 x i8>*
- %2 = load <16 x i8>, <16 x i8>* %1, align 1
- %3 = icmp slt <16 x i8> %2, zeroinitializer
- %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
- %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
- store <16 x i8> %5, <16 x i8>* %1, align 1
- ret void
+ %0 = icmp slt <16 x i8> %x, zeroinitializer
+ %1 = xor <16 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+ %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
}
-define void @test5(i8* nocapture %head) nounwind {
+define <16 x i8> @test5(<16 x i8> %x) nounwind {
; SSE-LABEL: test5:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test5:
; AVX: ## BB#0: ## %vector.ph
-; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqu %xmm0, (%rdi)
; AVX-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i8, i8* %head, i64 0
- %1 = bitcast i8* %0 to <16 x i8>*
- %2 = load <16 x i8>, <16 x i8>* %1, align 1
- %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
- %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
- %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer
- store <16 x i8> %5, <16 x i8>* %1, align 1
- ret void
+ %0 = icmp ugt <16 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
+ %1 = add <16 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
+ %res = select <16 x i1> %0, <16 x i8> %1, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
}
-define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind {
+define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-LABEL: test6:
; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movd %esi, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT: movdqu (%rdi), %xmm1
-; SSE2-NEXT: psubusb %xmm0, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rdi)
+; SSE2-NEXT: movd %edi, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: psubusb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test6:
; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movd %esi, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pshufb %xmm1, %xmm0
-; SSSE3-NEXT: movdqu (%rdi), %xmm1
-; SSSE3-NEXT: psubusb %xmm0, %xmm1
-; SSSE3-NEXT: movdqu %xmm1, (%rdi)
+; SSSE3-NEXT: movd %edi, %xmm1
+; SSSE3-NEXT: pxor %xmm2, %xmm2
+; SSSE3-NEXT: pshufb %xmm2, %xmm1
+; SSSE3-NEXT: psubusb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
+; SSE41-LABEL: test6:
+; SSE41: ## BB#0: ## %vector.ph
+; SSE41-NEXT: movd %edi, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pshufb %xmm2, %xmm1
+; SSE41-NEXT: psubusb %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
; AVX1-LABEL: test6:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovd %esi, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu (%rdi), %xmm1
-; AVX1-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX1-NEXT: vmovd %edi, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test6:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu (%rdi), %xmm1
-; AVX2-NEXT: vpsubusb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
vector.ph:
%0 = insertelement <16 x i8> undef, i8 %w, i32 0
%broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
- %1 = getelementptr inbounds i8, i8* %head, i64 0
- %2 = bitcast i8* %1 to <16 x i8>*
- %3 = load <16 x i8>, <16 x i8>* %2, align 1
- %4 = icmp ult <16 x i8> %3, %broadcast15
- %5 = sub <16 x i8> %3, %broadcast15
- %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5
- store <16 x i8> %6, <16 x i8>* %2, align 1
- ret void
+ %1 = icmp ult <16 x i8> %x, %broadcast15
+ %2 = sub <16 x i8> %x, %broadcast15
+ %res = select <16 x i1> %1, <16 x i8> zeroinitializer, <16 x i8> %2
+ ret <16 x i8> %res
}
-define void @test7(i16* nocapture %head) nounwind {
+define <16 x i16> @test7(<16 x i16> %x) nounwind {
; SSE-LABEL: test7:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
-; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
-; SSE-NEXT: movdqu %xmm1, 16(%rdi)
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test7:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
@@ -221,43 +171,29 @@ define void @test7(i16* nocapture %head) nounwind {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test7:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <16 x i16>*
- %2 = load <16 x i16>, <16 x i16>* %1, align 2
- %3 = icmp slt <16 x i16> %2, zeroinitializer
- %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
- %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
- store <16 x i16> %5, <16 x i16>* %1, align 2
- ret void
+ %0 = icmp slt <16 x i16> %x, zeroinitializer
+ %1 = xor <16 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
+ %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
}
-define void @test8(i16* nocapture %head) nounwind {
+define <16 x i16> @test8(<16 x i16> %x) nounwind {
; SSE-LABEL: test8:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
-; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
-; SSE-NEXT: movdqu %xmm1, 16(%rdi)
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test8:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
@@ -271,48 +207,33 @@ define void @test8(i16* nocapture %head) nounwind {
; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <16 x i16>*
- %2 = load <16 x i16>, <16 x i16>* %1, align 2
- %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
- %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
- %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer
- store <16 x i16> %5, <16 x i16>* %1, align 2
- ret void
-
+ %0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
+ %1 = add <16 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
+ %res = select <16 x i1> %0, <16 x i16> %1, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
}
-define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
+define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test9:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movd %esi, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE-NEXT: movdqu (%rdi), %xmm1
-; SSE-NEXT: movdqu 16(%rdi), %xmm2
-; SSE-NEXT: psubusw %xmm0, %xmm1
-; SSE-NEXT: psubusw %xmm0, %xmm2
-; SSE-NEXT: movdqu %xmm2, 16(%rdi)
-; SSE-NEXT: movdqu %xmm1, (%rdi)
+; SSE-NEXT: movd %edi, %xmm2
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE-NEXT: psubusw %xmm2, %xmm0
+; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test9:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovd %esi, %xmm2
+; AVX1-NEXT: vmovd %edi, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; AVX1-NEXT: vpsubw %xmm2, %xmm1, %xmm3
@@ -324,47 +245,33 @@ define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test9:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT: vmovdqu (%rdi), %ymm1
-; AVX2-NEXT: vpsubusw %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
+; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
vector.ph:
%0 = insertelement <16 x i16> undef, i16 %w, i32 0
%broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
- %1 = getelementptr inbounds i16, i16* %head, i64 0
- %2 = bitcast i16* %1 to <16 x i16>*
- %3 = load <16 x i16>, <16 x i16>* %2, align 2
- %4 = icmp ult <16 x i16> %3, %broadcast15
- %5 = sub <16 x i16> %3, %broadcast15
- %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5
- store <16 x i16> %6, <16 x i16>* %2, align 2
- ret void
+ %1 = icmp ult <16 x i16> %x, %broadcast15
+ %2 = sub <16 x i16> %x, %broadcast15
+ %res = select <16 x i1> %1, <16 x i16> zeroinitializer, <16 x i16> %2
+ ret <16 x i16> %res
}
-define void @test10(i8* nocapture %head) nounwind {
+define <32 x i8> @test10(<32 x i8> %x) nounwind {
; SSE-LABEL: test10:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
-; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE-NEXT: psubusb %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm2, %xmm1
-; SSE-NEXT: movdqu %xmm1, 16(%rdi)
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test10:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
@@ -372,44 +279,29 @@ define void @test10(i8* nocapture %head) nounwind {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i8, i8* %head, i64 0
- %1 = bitcast i8* %0 to <32 x i8>*
- %2 = load <32 x i8>, <32 x i8>* %1, align 1
- %3 = icmp slt <32 x i8> %2, zeroinitializer
- %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
- %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
- store <32 x i8> %5, <32 x i8>* %1, align 1
- ret void
-
+ %0 = icmp slt <32 x i8> %x, zeroinitializer
+ %1 = xor <32 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+ %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
}
-define void @test11(i8* nocapture %head) nounwind {
+define <32 x i8> @test11(<32 x i8> %x) nounwind {
; SSE-LABEL: test11:
; SSE: ## BB#0: ## %vector.ph
-; SSE-NEXT: movdqu (%rdi), %xmm0
-; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT: psubusb %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm2, %xmm1
-; SSE-NEXT: movdqu %xmm1, 16(%rdi)
-; SSE-NEXT: movdqu %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: test11:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
@@ -423,60 +315,51 @@ define void @test11(i8* nocapture %head) nounwind {
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i8, i8* %head, i64 0
- %1 = bitcast i8* %0 to <32 x i8>*
- %2 = load <32 x i8>, <32 x i8>* %1, align 1
- %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
- %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
- %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer
- store <32 x i8> %5, <32 x i8>* %1, align 1
- ret void
+ %0 = icmp ugt <32 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
+ %1 = add <32 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
+ %res = select <32 x i1> %0, <32 x i8> %1, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
}
-define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
+define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-LABEL: test12:
; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movd %esi, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT: movdqu (%rdi), %xmm1
-; SSE2-NEXT: movdqu 16(%rdi), %xmm2
-; SSE2-NEXT: psubusb %xmm0, %xmm1
-; SSE2-NEXT: psubusb %xmm0, %xmm2
-; SSE2-NEXT: movdqu %xmm2, 16(%rdi)
-; SSE2-NEXT: movdqu %xmm1, (%rdi)
+; SSE2-NEXT: movd %edi, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; SSE2-NEXT: psubusb %xmm2, %xmm0
+; SSE2-NEXT: psubusb %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test12:
; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movd %esi, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pshufb %xmm1, %xmm0
-; SSSE3-NEXT: movdqu (%rdi), %xmm1
-; SSSE3-NEXT: movdqu 16(%rdi), %xmm2
-; SSSE3-NEXT: psubusb %xmm0, %xmm1
-; SSSE3-NEXT: psubusb %xmm0, %xmm2
-; SSSE3-NEXT: movdqu %xmm2, 16(%rdi)
-; SSSE3-NEXT: movdqu %xmm1, (%rdi)
+; SSSE3-NEXT: movd %edi, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: pshufb %xmm3, %xmm2
+; SSSE3-NEXT: psubusb %xmm2, %xmm0
+; SSSE3-NEXT: psubusb %xmm2, %xmm1
; SSSE3-NEXT: retq
;
+; SSE41-LABEL: test12:
+; SSE41: ## BB#0: ## %vector.ph
+; SSE41-NEXT: movd %edi, %xmm2
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pshufb %xmm3, %xmm2
+; SSE41-NEXT: psubusb %xmm2, %xmm0
+; SSE41-NEXT: psubusb %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
; AVX1-LABEL: test12:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rdi), %ymm0
-; AVX1-NEXT: vmovd %esi, %xmm1
+; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -489,617 +372,675 @@ define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rdi)
-; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT: vmovdqu (%rdi), %ymm1
-; AVX2-NEXT: vpsubusb %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vmovd %edi, %xmm1
+; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
vector.ph:
%0 = insertelement <32 x i8> undef, i8 %w, i32 0
%broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
- %1 = getelementptr inbounds i8, i8* %head, i64 0
- %2 = bitcast i8* %1 to <32 x i8>*
- %3 = load <32 x i8>, <32 x i8>* %2, align 1
- %4 = icmp ult <32 x i8> %3, %broadcast15
- %5 = sub <32 x i8> %3, %broadcast15
- %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5
- store <32 x i8> %6, <32 x i8>* %2, align 1
- ret void
+ %1 = icmp ult <32 x i8> %x, %broadcast15
+ %2 = sub <32 x i8> %x, %broadcast15
+ %res = select <32 x i1> %1, <32 x i8> zeroinitializer, <32 x i8> %2
+ ret <32 x i8> %res
}
-define void @test13(i16* nocapture %head, i32* nocapture %w) nounwind {
+define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: test13:
; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu (%rsi), %xmm2
-; SSE2-NEXT: movdqu 16(%rsi), %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; SSE2-NEXT: psubd %xmm1, %xmm3
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: movdqu %xmm4, (%rdi)
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: packssdw %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test13:
; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movdqu (%rdi), %xmm0
-; SSSE3-NEXT: movdqu (%rsi), %xmm2
-; SSSE3-NEXT: movdqu 16(%rsi), %xmm3
; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: psubd %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: psubd %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm6
; SSSE3-NEXT: pxor %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm1, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm5, %xmm6
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0]
-; SSSE3-NEXT: psubd %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
+; SSSE3-NEXT: pshufb %xmm5, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; SSSE3-NEXT: psubd %xmm1, %xmm3
; SSSE3-NEXT: pshufb %xmm5, %xmm0
-; SSSE3-NEXT: pshufb %xmm5, %xmm1
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: pandn %xmm1, %xmm6
-; SSSE3-NEXT: movdqu %xmm6, (%rdi)
+; SSSE3-NEXT: pshufb %xmm5, %xmm3
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSSE3-NEXT: pandn %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
+; SSE41-LABEL: test13:
+; SSE41: ## BB#0: ## %vector.ph
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm3, %xmm6
+; SSE41-NEXT: psubd %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: pshufb %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE41-NEXT: pshufb %xmm1, %xmm6
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; SSE41-NEXT: psubd %xmm2, %xmm4
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: pshufb %xmm1, %xmm4
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSE41-NEXT: pandn %xmm3, %xmm0
+; SSE41-NEXT: retq
+;
; AVX1-LABEL: test13:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rsi), %ymm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm6
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpacksswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test13:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rsi), %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <8 x i16>*
- %2 = load <8 x i16>, <8 x i16>* %1, align 2
- %3 = getelementptr inbounds i32, i32* %w, i64 0
- %4 = bitcast i32* %3 to <8 x i32>*
- %5 = load <8 x i32>, <8 x i32>* %4, align 2
- %6 = zext <8 x i16> %2 to <8 x i32>
- %7 = icmp ult <8 x i32> %6, %5
- %8 = sub <8 x i32> %6, %5
- %9 = trunc <8 x i32> %8 to <8 x i16>
- %10 = select <8 x i1> %7, <8 x i16> zeroinitializer, <8 x i16> %9
- store <8 x i16> %10, <8 x i16>* %1, align 1
- ret void
+ %lhs = zext <8 x i16> %x to <8 x i32>
+ %cond = icmp ult <8 x i32> %lhs, %y
+ %sub = sub <8 x i32> %lhs, %y
+ %trunc = trunc <8 x i32> %sub to <8 x i16>
+ %res = select <8 x i1> %cond, <8 x i16> zeroinitializer, <8 x i16> %trunc
+ ret <8 x i16> %res
}
-define void @test14(i8* nocapture %head, i32* nocapture %w) nounwind {
+define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; SSE2-LABEL: test14:
; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu (%rsi), %xmm8
-; SSE2-NEXT: movdqu 16(%rsi), %xmm9
-; SSE2-NEXT: movdqu 32(%rsi), %xmm10
-; SSE2-NEXT: movdqu 48(%rsi), %xmm7
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm7, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm7
-; SSE2-NEXT: pxor %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255]
-; SSE2-NEXT: pand %xmm5, %xmm7
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: psubd %xmm10, %xmm6
-; SSE2-NEXT: pxor %xmm3, %xmm10
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm10
-; SSE2-NEXT: pand %xmm5, %xmm10
-; SSE2-NEXT: packuswb %xmm7, %xmm10
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psubd %xmm9, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm9
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm9
-; SSE2-NEXT: pand %xmm5, %xmm9
-; SSE2-NEXT: movdqa %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm5, %xmm4
-; SSE2-NEXT: packuswb %xmm9, %xmm4
-; SSE2-NEXT: packuswb %xmm10, %xmm4
-; SSE2-NEXT: psubd %xmm8, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: packuswb %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: packuswb %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm6, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: movdqu %xmm4, (%rdi)
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pxor %xmm0, %xmm9
+; SSE2-NEXT: psubd %xmm5, %xmm4
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm9, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255]
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: pxor %xmm0, %xmm7
+; SSE2-NEXT: psubd %xmm10, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm10
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm10
+; SSE2-NEXT: pand %xmm9, %xmm10
+; SSE2-NEXT: packuswb %xmm5, %xmm10
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: psubd %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm9, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm8, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT: pand %xmm9, %xmm0
+; SSE2-NEXT: packuswb %xmm6, %xmm0
+; SSE2-NEXT: packuswb %xmm10, %xmm0
+; SSE2-NEXT: psubd %xmm8, %xmm1
+; SSE2-NEXT: pand %xmm9, %xmm4
+; SSE2-NEXT: pand %xmm9, %xmm3
+; SSE2-NEXT: packuswb %xmm4, %xmm3
+; SSE2-NEXT: pand %xmm9, %xmm2
+; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: packuswb %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test14:
; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movdqu (%rdi), %xmm0
-; SSSE3-NEXT: movdqu (%rsi), %xmm8
-; SSSE3-NEXT: movdqu 16(%rsi), %xmm9
-; SSSE3-NEXT: movdqu 32(%rsi), %xmm10
-; SSSE3-NEXT: movdqu 48(%rsi), %xmm7
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSSE3-NEXT: pxor %xmm7, %xmm7
+; SSSE3-NEXT: movdqa %xmm0, %xmm11
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
+; SSSE3-NEXT: movdqa %xmm11, %xmm8
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15]
+; SSSE3-NEXT: movdqa %xmm0, %xmm10
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm4, %xmm9
+; SSSE3-NEXT: pxor %xmm7, %xmm9
+; SSSE3-NEXT: psubd %xmm0, %xmm4
; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: psubd %xmm7, %xmm0
-; SSSE3-NEXT: pxor %xmm3, %xmm7
-; SSSE3-NEXT: pxor %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pxor %xmm7, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm9, %xmm6
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm7, %xmm5
+; SSSE3-NEXT: psubd %xmm10, %xmm3
+; SSSE3-NEXT: movdqa %xmm10, %xmm0
+; SSSE3-NEXT: pxor %xmm7, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
+; SSSE3-NEXT: pshufb %xmm9, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm7, %xmm5
+; SSSE3-NEXT: psubd %xmm11, %xmm2
+; SSSE3-NEXT: pxor %xmm7, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm11
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm5, %xmm11
+; SSSE3-NEXT: movdqa %xmm1, %xmm6
+; SSSE3-NEXT: pxor %xmm7, %xmm6
+; SSSE3-NEXT: pxor %xmm8, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
; SSSE3-NEXT: pshufb %xmm5, %xmm7
-; SSSE3-NEXT: movdqa %xmm6, %xmm4
-; SSSE3-NEXT: psubd %xmm10, %xmm6
-; SSSE3-NEXT: pxor %xmm3, %xmm10
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm10
-; SSSE3-NEXT: pshufb %xmm5, %xmm10
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: psubd %xmm9, %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm9
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm9
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm4, %xmm9
-; SSSE3-NEXT: movdqa %xmm8, %xmm5
-; SSSE3-NEXT: pxor %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pshufb %xmm4, %xmm5
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
-; SSSE3-NEXT: movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1]
-; SSSE3-NEXT: psubd %xmm8, %xmm2
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm6
-; SSSE3-NEXT: packuswb %xmm0, %xmm6
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: packuswb %xmm1, %xmm2
-; SSSE3-NEXT: packuswb %xmm6, %xmm2
-; SSSE3-NEXT: andnpd %xmm2, %xmm10
-; SSSE3-NEXT: movupd %xmm10, (%rdi)
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
+; SSSE3-NEXT: psubd %xmm8, %xmm1
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSSE3-NEXT: pand %xmm5, %xmm4
+; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: packuswb %xmm4, %xmm3
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: packuswb %xmm2, %xmm1
+; SSSE3-NEXT: packuswb %xmm3, %xmm1
+; SSSE3-NEXT: andnpd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
+; SSE41-LABEL: test14:
+; SSE41: ## BB#0: ## %vector.ph
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,3,0,1]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm4, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: psubd %xmm6, %xmm4
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm6
+; SSE41-NEXT: movdqa {{.*#+}} xmm10 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pshufb %xmm10, %xmm6
+; SSE41-NEXT: movdqa %xmm3, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: psubd %xmm9, %xmm3
+; SSE41-NEXT: pxor %xmm5, %xmm9
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE41-NEXT: pshufb %xmm10, %xmm9
+; SSE41-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
+; SSE41-NEXT: movdqa %xmm1, %xmm6
+; SSE41-NEXT: pxor %xmm5, %xmm6
+; SSE41-NEXT: psubd %xmm0, %xmm1
+; SSE41-NEXT: pxor %xmm5, %xmm0
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pshufb %xmm6, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: pxor %xmm8, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm5
+; SSE41-NEXT: pshufb %xmm6, %xmm5
+; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4,5,6,7]
+; SSE41-NEXT: psubd %xmm8, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE41-NEXT: pand %xmm5, %xmm4
+; SSE41-NEXT: pand %xmm5, %xmm3
+; SSE41-NEXT: packuswb %xmm4, %xmm3
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm1
+; SSE41-NEXT: packuswb %xmm3, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
; AVX1-LABEL: test14:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rsi), %ymm0
-; AVX1-NEXT: vmovdqu 32(%rsi), %ymm1
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm6, %xmm10, %xmm7
-; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm11
-; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm4
-; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpxor %xmm6, %xmm0, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm4
+; AVX1-NEXT: vpxor %xmm6, %xmm10, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm11
+; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm4
+; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm3
+; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpacksswb %xmm11, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm8, %xmm0
-; AVX1-NEXT: vpsubd %xmm7, %xmm9, %xmm4
-; AVX1-NEXT: vpsubd %xmm1, %xmm10, %xmm1
-; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpsubd %xmm8, %xmm5, %xmm4
+; AVX1-NEXT: vpsubd %xmm9, %xmm1, %xmm1
+; AVX1-NEXT: vpsubd %xmm10, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm2
-; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test14:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rsi), %ymm0
-; AVX2-NEXT: vmovdqu 32(%rsi), %ymm1
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4
-; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm5
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm6
+; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm6
; AVX2-NEXT: vpcmpgtd %ymm5, %ymm6, %ymm5
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpacksswb %xmm6, %xmm5, %xmm5
; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm6
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4
+; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm4
; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX2-NEXT: vpacksswb %xmm6, %xmm4, %xmm4
-; AVX2-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpsubd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %xmm4, %xmm5, %xmm4
+; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i8, i8* %head, i64 0
- %1 = bitcast i8* %0 to <16 x i8>*
- %2 = load <16 x i8>, <16 x i8>* %1, align 2
- %3 = getelementptr inbounds i32, i32* %w, i64 0
- %4 = bitcast i32* %3 to <16 x i32>*
- %5 = load <16 x i32>, <16 x i32>* %4, align 2
- %6 = zext <16 x i8> %2 to <16 x i32>
- %7 = icmp ult <16 x i32> %6, %5
- %8 = sub <16 x i32> %6, %5
- %9 = trunc <16 x i32> %8 to <16 x i8>
- %10 = select <16 x i1> %7, <16 x i8> zeroinitializer, <16 x i8> %9
- store <16 x i8> %10, <16 x i8>* %1, align 1
- ret void
+ %rhs = zext <16 x i8> %x to <16 x i32>
+ %cond = icmp ult <16 x i32> %y, %rhs
+ %sub = sub <16 x i32> %y, %rhs
+ %truncsub = trunc <16 x i32> %sub to <16 x i8>
+ %res = select <16 x i1> %cond, <16 x i8> zeroinitializer, <16 x i8> %truncsub
+ ret <16 x i8> %res
}
-define void @test15(i16* nocapture %head, i32* nocapture %w) nounwind {
+define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: test15:
; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu (%rsi), %xmm2
-; SSE2-NEXT: movdqu 16(%rsi), %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE2-NEXT: psubd %xmm1, %xmm3
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rdi)
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: packssdw %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test15:
; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movdqu (%rdi), %xmm0
-; SSSE3-NEXT: movdqu (%rsi), %xmm2
-; SSSE3-NEXT: movdqu 16(%rsi), %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: psubd %xmm4, %xmm0
+; SSSE3-NEXT: psubd %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm5
+; SSSE3-NEXT: movdqa %xmm1, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm4, %xmm5
-; SSSE3-NEXT: movdqa %xmm2, %xmm6
-; SSSE3-NEXT: pxor %xmm3, %xmm6
-; SSSE3-NEXT: pxor %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3
-; SSSE3-NEXT: pshufb %xmm4, %xmm3
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; SSSE3-NEXT: psubd %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: pshufb %xmm4, %xmm1
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: movdqu %xmm1, (%rdi)
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
+; SSSE3-NEXT: pshufb %xmm2, %xmm4
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSSE3-NEXT: psubd %xmm1, %xmm3
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: retq
;
+; SSE41-LABEL: test15:
+; SSE41: ## BB#0: ## %vector.ph
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: pshufb %xmm1, %xmm5
+; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: pxor %xmm4, %xmm6
+; SSE41-NEXT: pxor %xmm3, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE41-NEXT: pshufb %xmm1, %xmm4
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE41-NEXT: psubd %xmm2, %xmm3
+; SSE41-NEXT: pshufb %xmm1, %xmm0
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: retq
+;
; AVX1-LABEL: test15:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rsi), %ymm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm6
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpacksswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test15:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rsi), %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <8 x i16>*
- %2 = load <8 x i16>, <8 x i16>* %1, align 2
- %3 = getelementptr inbounds i32, i32* %w, i64 0
- %4 = bitcast i32* %3 to <8 x i32>*
- %5 = load <8 x i32>, <8 x i32>* %4, align 2
- %6 = zext <8 x i16> %2 to <8 x i32>
- %7 = icmp ugt <8 x i32> %6, %5
- %8 = sub <8 x i32> %6, %5
- %9 = trunc <8 x i32> %8 to <8 x i16>
- %10 = select <8 x i1> %7, <8 x i16> %9, <8 x i16> zeroinitializer
- store <8 x i16> %10, <8 x i16>* %1, align 1
- ret void
+ %lhs = zext <8 x i16> %x to <8 x i32>
+ %cond = icmp ugt <8 x i32> %lhs, %y
+ %sub = sub <8 x i32> %lhs, %y
+ %truncsub = trunc <8 x i32> %sub to <8 x i16>
+ %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
}
-define void @test16(i16* nocapture %head, i32* nocapture %w) nounwind {
+define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: test16:
; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu (%rsi), %xmm2
-; SSE2-NEXT: movdqu 16(%rsi), %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE2-NEXT: psubd %xmm2, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE2-NEXT: psubd %xmm1, %xmm3
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: movdqu %xmm1, (%rdi)
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: packssdw %xmm0, %xmm3
+; SSE2-NEXT: pand %xmm4, %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test16:
; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: movdqu (%rdi), %xmm0
-; SSSE3-NEXT: movdqu (%rsi), %xmm2
-; SSSE3-NEXT: movdqu 16(%rsi), %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: psubd %xmm4, %xmm0
+; SSSE3-NEXT: psubd %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm4, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm2, %xmm5
+; SSSE3-NEXT: movdqa %xmm1, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm4, %xmm5
-; SSSE3-NEXT: movdqa %xmm2, %xmm6
-; SSSE3-NEXT: pxor %xmm3, %xmm6
-; SSSE3-NEXT: pxor %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm3
-; SSSE3-NEXT: pshufb %xmm4, %xmm3
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; SSSE3-NEXT: psubd %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: pshufb %xmm4, %xmm1
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: movdqu %xmm1, (%rdi)
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
+; SSSE3-NEXT: pshufb %xmm2, %xmm4
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSSE3-NEXT: psubd %xmm1, %xmm3
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSSE3-NEXT: pand %xmm4, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: retq
;
+; SSE41-LABEL: test16:
+; SSE41: ## BB#0: ## %vector.ph
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm0, %xmm5
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm4, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm5
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: pshufb %xmm1, %xmm5
+; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: pxor %xmm4, %xmm6
+; SSE41-NEXT: pxor %xmm3, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE41-NEXT: pshufb %xmm1, %xmm4
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE41-NEXT: psubd %xmm2, %xmm3
+; SSE41-NEXT: pshufb %xmm1, %xmm0
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: retq
+;
; AVX1-LABEL: test16:
; AVX1: ## BB#0: ## %vector.ph
-; AVX1-NEXT: vmovdqu (%rsi), %ymm0
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm6
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpacksswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16:
; AVX2: ## BB#0: ## %vector.ph
-; AVX2-NEXT: vmovdqu (%rsi), %ymm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
- %0 = getelementptr inbounds i16, i16* %head, i64 0
- %1 = bitcast i16* %0 to <8 x i16>*
- %2 = load <8 x i16>, <8 x i16>* %1, align 2
- %3 = getelementptr inbounds i32, i32* %w, i64 0
- %4 = bitcast i32* %3 to <8 x i32>*
- %5 = load <8 x i32>, <8 x i32>* %4, align 2
- %6 = zext <8 x i16> %2 to <8 x i32>
- %7 = icmp ult <8 x i32> %5, %6
- %8 = sub <8 x i32> %6, %5
- %9 = trunc <8 x i32> %8 to <8 x i16>
- %10 = select <8 x i1> %7, <8 x i16> %9, <8 x i16> zeroinitializer
- store <8 x i16> %10, <8 x i16>* %1, align 1
- ret void
+ %lhs = zext <8 x i16> %x to <8 x i32>
+ %cond = icmp ult <8 x i32> %y, %lhs
+ %sub = sub <8 x i32> %lhs, %y
+ %truncsub = trunc <8 x i32> %sub to <8 x i16>
+ %res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
}
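Tests 14-16 above exercise the widened (zext/sub/trunc/select) form of the
unsigned saturated-subtract idiom, which, as the long CHECK sequences show,
is not yet collapsed to a single instruction. For contrast, here is a minimal
sketch of the direct form (an editor illustration with a hypothetical name,
not part of the patch), which the X86 backend does match to one psubusw:

; Editor sketch, not part of the patch: the non-widened form of the idiom.
define <8 x i16> @usubsat_sketch(<8 x i16> %x, <8 x i16> %y) {
  %cond = icmp ugt <8 x i16> %x, %y          ; x >u y ?
  %sub = sub <8 x i16> %x, %y
  %res = select <8 x i1> %cond, <8 x i16> %sub, <8 x i16> zeroinitializer
  ret <8 x i16> %res                         ; expected: a single psubusw
}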
diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll
index 16f152d169d3..5e9e1e364fef 100644
--- a/test/CodeGen/X86/store-narrow.ll
+++ b/test/CodeGen/X86/store-narrow.ll
@@ -134,10 +134,7 @@ entry:
@g_16 = internal global i32 -1
; X64-LABEL: test8:
-; X64-NEXT: movl _g_16(%rip), %eax
-; X64-NEXT: movl $0, _g_16(%rip)
-; X64-NEXT: orl $1, %eax
-; X64-NEXT: movl %eax, _g_16(%rip)
+; X64-NEXT: orb $1, _g_16(%rip)
; X64-NEXT: ret
define void @test8() nounwind {
%tmp = load i32, i32* @g_16
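The new CHECK line above verifies that a 32-bit read-modify-write of @g_16 is
narrowed to a single byte-wide `orb`. The hunk truncates the body of @test8;
a sketch of its likely shape, reconstructed from the replaced CHECK lines
(editor illustration with hypothetical names, not part of the patch):

; Editor sketch: body assumed from the old movl/movl/orl/movl sequence above.
@g_16_sketch = internal global i32 -1
define void @test8_sketch() nounwind {
  %tmp = load i32, i32* @g_16_sketch
  store i32 0, i32* @g_16_sketch      ; dead: overwritten by the store below
  %or = or i32 %tmp, 1                ; only the low byte changes
  store i32 %or, i32* @g_16_sketch
  ret void                            ; whole RMW narrows to: orb $1, _g_16(%rip)
}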
diff --git a/test/CodeGen/X86/swift-return.ll b/test/CodeGen/X86/swift-return.ll
index 60e33e62b4ad..0ea176d5d82f 100644
--- a/test/CodeGen/X86/swift-return.ll
+++ b/test/CodeGen/X86/swift-return.ll
@@ -184,11 +184,11 @@ define void @consume_i1_ret() {
%v6 = extractvalue { i1, i1, i1, i1 } %call, 2
%v7 = extractvalue { i1, i1, i1, i1 } %call, 3
%val = zext i1 %v3 to i32
- store i32 %val, i32* @var
+ store volatile i32 %val, i32* @var
%val2 = zext i1 %v5 to i32
- store i32 %val2, i32* @var
+ store volatile i32 %val2, i32* @var
%val3 = zext i1 %v6 to i32
- store i32 %val3, i32* @var
+ store volatile i32 %val3, i32* @var
%val4 = zext i1 %v7 to i32
store i32 %val4, i32* @var
ret void
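Marking the first three stores volatile pins each of them in the output;
plain consecutive stores to @var are candidates for dead-store elimination,
which would leave nothing for the per-element checks to match. The
win64_sibcall.ll hunk below applies the same fix. A minimal sketch of the
effect (hypothetical names, not part of the patch):

; Editor sketch: volatile stores must each be emitted, in order, while
; non-volatile stores to the same location may collapse to the last one.
@var_sketch = global i32 0
define void @volatile_sketch() {
  store volatile i32 1, i32* @var_sketch  ; kept
  store volatile i32 2, i32* @var_sketch  ; kept
  store i32 3, i32* @var_sketch           ; non-volatile: removable if overwritten later
  ret void
}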
diff --git a/test/CodeGen/X86/win32-spill-xmm.ll b/test/CodeGen/X86/win32-spill-xmm.ll
index 0db97cfe20f0..c6b163b88b24 100644
--- a/test/CodeGen/X86/win32-spill-xmm.ll
+++ b/test/CodeGen/X86/win32-spill-xmm.ll
@@ -20,7 +20,7 @@ declare void @bar(<16 x float> %a, i32 %b)
; Check that proper alignment of a spilled vector does not affect vargs
; CHECK-LABEL: vargs_not_affected
-; CHECK: leal 28(%ebp), %eax
+; CHECK: movl 28(%ebp), %eax
define i32 @vargs_not_affected(<4 x float> %v, i8* %f, ...) {
entry:
%ap = alloca i8*, align 4
diff --git a/test/CodeGen/X86/win64_sibcall.ll b/test/CodeGen/X86/win64_sibcall.ll
index 4bba0e1e0acd..42dd4d31ca9f 100644
--- a/test/CodeGen/X86/win64_sibcall.ll
+++ b/test/CodeGen/X86/win64_sibcall.ll
@@ -12,8 +12,8 @@ entry:
; LINUX: movq $0, -8(%rsp)
%this = alloca %Object addrspace(1)*
- store %Object addrspace(1)* null, %Object addrspace(1)** %this
- store %Object addrspace(1)* %param0, %Object addrspace(1)** %this
+ store volatile %Object addrspace(1)* null, %Object addrspace(1)** %this
+ store volatile %Object addrspace(1)* %param0, %Object addrspace(1)** %this
br label %0
; <label>:0 ; preds = %entry
diff --git a/test/CodeGen/X86/win64_vararg.ll b/test/CodeGen/X86/win64_vararg.ll
index 8d7f2010a541..20386bf36395 100644
--- a/test/CodeGen/X86/win64_vararg.ll
+++ b/test/CodeGen/X86/win64_vararg.ll
@@ -94,9 +94,7 @@ entry:
; CHECK-LABEL: arg4:
; CHECK: pushq
-; va_start:
-; CHECK: leaq 48(%rsp), [[REG_arg4_1:%[a-z]+]]
-; CHECK: movq [[REG_arg4_1]], (%rsp)
+; va_start (optimized away as overwritten by va_arg)
; va_arg:
; CHECK: leaq 52(%rsp), [[REG_arg4_2:%[a-z]+]]
; CHECK: movq [[REG_arg4_2]], (%rsp)
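This hunk and the x86-64-ms_abi copy below drop the checks for va_start's
initial store: va_arg recomputes and re-stores the advanced pointer without
reading the old value, so the first store is dead. A sketch of the underlying
IR shape (assumed names, not part of the patch):

; Editor sketch of the va_start/va_arg pattern behind the checks above.
declare void @llvm.va_start(i8*)
define i32 @arg4_sketch(i8* %f, ...) nounwind {
  %ap = alloca i8*                        ; va_list lives at (%rsp)
  %ap.cast = bitcast i8** %ap to i8*
  call void @llvm.va_start(i8* %ap.cast)  ; stores the arg-area pointer: now dead
  %v = va_arg i8** %ap, i32               ; overwrites %ap with the advanced pointer
  ret i32 %v
}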
diff --git a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll
index e3436521a5bd..299190e8a595 100644
--- a/test/CodeGen/X86/x86-64-ms_abi-vararg.ll
+++ b/test/CodeGen/X86/x86-64-ms_abi-vararg.ll
@@ -90,9 +90,7 @@ entry:
}
; CHECK-LABEL: arg4:
-; va_start:
-; CHECK: leaq 48(%rsp), [[REG_arg4_1:%[a-z]+]]
-; CHECK: movq [[REG_arg4_1]], (%rsp)
+; va_start (optimized away as overwritten by va_arg)
; va_arg:
; CHECK: leaq 52(%rsp), [[REG_arg4_2:%[a-z]+]]
; CHECK: movq [[REG_arg4_2]], (%rsp)