228 files changed, 9093 insertions, 1574 deletions
diff --git a/test/Analysis/CostModel/AArch64/falkor.ll b/test/Analysis/CostModel/AArch64/falkor.ll
deleted file mode 100644
index e9563191f077..000000000000
--- a/test/Analysis/CostModel/AArch64/falkor.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: opt < %s -cost-model -analyze -mcpu=falkor | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnu"
-
-; CHECK-LABEL: vectorInstrCost
-define void @vectorInstrCost() {
-
-    ; Vector extracts - extracting the first element should have a zero cost;
-    ; all other elements should have a cost of two.
-    ;
-    ; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0
-    ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1
-    %t1 = extractelement <2 x i64> undef, i32 0
-    %t2 = extractelement <2 x i64> undef, i32 1
-
-    ; Vector inserts - inserting the first element should have a zero cost; all
-    ; other elements should have a cost of two.
-    ;
-    ; CHECK: cost of 0 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0
-    ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1
-    %t3 = insertelement <2 x i64> undef, i64 undef, i32 0
-    %t4 = insertelement <2 x i64> undef, i64 undef, i32 1
-
-    ret void
-}
diff --git a/test/Analysis/Delinearization/constant_functions_multi_dim.ll b/test/Analysis/Delinearization/constant_functions_multi_dim.ll
new file mode 100644
index 000000000000..b44b900d3f52
--- /dev/null
+++ b/test/Analysis/Delinearization/constant_functions_multi_dim.ll
@@ -0,0 +1,80 @@
+; RUN: opt -delinearize -analyze < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK:      Inst:  %tmp = load float, float* %arrayidx, align 4
+; CHECK-NEXT: In Loop with Header: for.inc
+; CHECK-NEXT: AccessFunction: {(4 * %N * %call),+,4}<nsw><%for.inc>
+; CHECK-NEXT: Base offset: %A
+; CHECK-NEXT: ArrayDecl[UnknownSize][%N] with elements of 4 bytes.
+; CHECK-NEXT: ArrayRef[%call][{0,+,1}<nuw><nsw><%for.inc>]
+
+; CHECK:      Inst:  %tmp5 = load float, float* %arrayidx4, align 4
+; CHECK-NEXT: In Loop with Header: for.inc
+; CHECK-NEXT: AccessFunction: {(4 * %call1),+,(4 * %N)}<nsw><%for.inc>
+; CHECK-NEXT: Base offset: %B
+; CHECK-NEXT: ArrayDecl[UnknownSize][%N] with elements of 4 bytes.
+; CHECK-NEXT: ArrayRef[{0,+,1}<nuw><nsw><%for.inc>][%call1]
+
+; Function Attrs: noinline nounwind uwtable
+define void @mat_mul(float* %C, float* %A, float* %B, i64 %N) #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 {
+entry:
+  br label %entry.split
+
+entry.split:                                      ; preds = %entry
+  %call = tail call i64 @_Z13get_global_idj(i32 0) #3
+  %call1 = tail call i64 @_Z13get_global_idj(i32 1) #3
+  %cmp1 = icmp sgt i64 %N, 0
+  %mul = mul nsw i64 %call, %N
+  br i1 %cmp1, label %for.inc.lr.ph, label %for.end
+
+for.inc.lr.ph:                                    ; preds = %entry.split
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.inc.lr.ph, %for.inc
+  %acc.03 = phi float [ 0.000000e+00, %for.inc.lr.ph ], [ %tmp6, %for.inc ]
+  %m.02 = phi i64 [ 0, %for.inc.lr.ph ], [ %inc, %for.inc ]
+  %add = add nsw i64 %m.02, %mul
+  %arrayidx = getelementptr inbounds float, float* %A, i64 %add
+  %tmp = load float, float* %arrayidx, align 4
+  %mul2 = mul nsw i64 %m.02, %N
+  %add3 = add nsw i64 %mul2, %call1
+  %arrayidx4 = getelementptr inbounds float, float* %B, i64 %add3
+  %tmp5 = load float, float* %arrayidx4, align 4
+  %tmp6 = tail call float @llvm.fmuladd.f32(float %tmp, float %tmp5, float %acc.03)
+  %inc = add nuw nsw i64 %m.02, 1
+  %exitcond = icmp ne i64 %inc, %N
+  br i1 %exitcond, label %for.inc, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.inc
+  %.lcssa = phi float [ %tmp6, %for.inc ]
+  br label %for.end
+
+for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry.split
+  %acc.0.lcssa = phi float [ %.lcssa, %for.cond.for.end_crit_edge ], [ 0.000000e+00, %entry.split ]
+  %add7 = add nsw i64 %mul, %call1
+  %arrayidx8 = getelementptr inbounds float, float* %C, i64 %add7
+  store float %acc.0.lcssa, float* %arrayidx8, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i64 @_Z13get_global_idj(i32) #1
+
+; Function Attrs: nounwind readnone speculatable
+declare float @llvm.fmuladd.f32(float, float, float) #2
+
+attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { nounwind readnone }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 5.0.0 (trunk 303846) (llvm/trunk 303834)"}
+!2 = !{i32 1, i32 1, i32 1, i32 0}
+!3 = !{!"none", !"none", !"none", !"none"}
+!4 = !{!"float*", !"float*", !"float*", !"long"}
+!5 = !{!"", !"", !"", !""}
diff --git a/test/Analysis/IVUsers/quadradic-exit-value.ll b/test/Analysis/IVUsers/quadradic-exit-value.ll
index 6d4f1b039b48..afc215198218 100644
--- a/test/Analysis/IVUsers/quadradic-exit-value.ll
+++ b/test/Analysis/IVUsers/quadradic-exit-value.ll
@@ -30,13 +30,47 @@ exit:
   ret i64 %r
 }
 
+; PR15470: LSR miscompile. The test1 function should return '1'.
+; It is valid to fold SCEVUnknown into the recurrence because it
+; was defined before the loop.
+;
+; SCEV does not know how to denormalize chained recurrences, so make
+; sure they aren't marked as post-inc users.
+;
+; CHECK-LABEL: IV Users for loop %test1.loop
+; CHECK-NO-LCSSA: %sext.us = {0,+,(16777216 + (-16777216 * %sub.us))<nuw><nsw>,+,33554432}<%test1.loop> (post-inc with loop %test1.loop) in    %f = ashr i32 %sext.us, 24
+define i32 @test1(i1 %cond) {
+entry:
+  %sub.us = select i1 %cond, i32 0, i32 0
+  br label %test1.loop
+
+test1.loop:
+  %inc1115.us = phi i32 [ 0, %entry ], [ %inc11.us, %test1.loop ]
+  %inc11.us = add nsw i32 %inc1115.us, 1
+  %cmp.us = icmp slt i32 %inc11.us, 2
+  br i1 %cmp.us, label %test1.loop, label %for.end
+
+for.end:
+  %tobool.us = icmp eq i32 %inc1115.us, 0
+  %mul.us = shl i32 %inc1115.us, 24
+  %sub.cond.us = sub nsw i32 %inc1115.us, %sub.us
+  %sext.us = mul i32 %mul.us, %sub.cond.us
+  %f = ashr i32 %sext.us, 24
+  br label %exit
+
+exit:
+  ret i32 %f
+}
+
 ; PR15470: LSR miscompile. The test2 function should return '1'.
+; It is illegal to fold SCEVUnknown (sext.us) into the recurrence
+; because it is defined after the loop where this recurrence belongs.
 ;
 ; SCEV does not know how to denormalize chained recurrences, so make
 ; sure they aren't marked as post-inc users.
 ;
 ; CHECK-LABEL: IV Users for loop %test2.loop
-; CHECK-NO-LCSSA: %sext.us = {0,+,(16777216 + (-16777216 * %sub.us))<nuw><nsw>,+,33554432}<%test2.loop> (post-inc with loop %test2.loop) in    %f = ashr i32 %sext.us, 24
+; CHECK-NO-LCSSA: %sub.cond.us = ((-1 * %sub.us)<nsw> + {0,+,1}<nuw><nsw><%test2.loop>) (post-inc with loop %test2.loop) in    %sext.us = mul i32 %mul.us, %sub.cond.us
 define i32 @test2() {
 entry:
   br label %test2.loop
diff --git a/test/Analysis/ScalarEvolution/different-loops-recs.ll b/test/Analysis/ScalarEvolution/different-loops-recs.ll
index ad3d1e0bd110..6b88f09e936f 100644
--- a/test/Analysis/ScalarEvolution/different-loops-recs.ll
+++ b/test/Analysis/ScalarEvolution/different-loops-recs.ll
@@ -220,7 +220,8 @@ exit:
 
 ; Mix of previous use cases that demonstrates %s3 can be incorrectly treated as
 ; a recurrence of loop1 because of operands order if we pick recurrencies in an
-; incorrect order.
+; incorrect order. It also shows that we cannot safely fold v1 (SCEVUnknown)
+; because we cannot prove for sure that it doesn't use Phis of loop 2.
 
 define void @test_03(i32 %a, i32 %b, i32 %c, i32* %p) {
 
@@ -228,9 +229,9 @@ define void @test_03(i32 %a, i32 %b, i32 %c, i32* %p) {
 ; CHECK:       %v1 = load i32, i32* %p
 ; CHECK-NEXT:  -->  %v1
 ; CHECK:       %s1 = add i32 %phi1, %v1
-; CHECK-NEXT:  -->  {(%a + %v1),+,1}<%loop1>
+; CHECK-NEXT:  -->  ({%a,+,1}<%loop1> + %v1)
 ; CHECK:       %s2 = add i32 %s1, %b
-; CHECK-NEXT:  -->  {(%a + %b + %v1),+,1}<%loop1>
+; CHECK-NEXT:  -->  ({(%a + %b),+,1}<%loop1> + %v1)
 ; CHECK:       %s3 = add i32 %s2, %phi2
 ; CHECK-NEXT:  -->  ({{{{}}((2 * %a) + %b),+,1}<%loop1>,+,2}<%loop2> + %v1)
 
@@ -452,3 +453,60 @@ exit:
   %s6 = add i32 %phi3, %phi2
   ret void
 }
+
+; Make sure that a complicated Phi does not get folded with rec's start value
+; of a loop which is above.
+define void @test_08() {
+
+; CHECK-LABEL: Classifying expressions for: @test_08
+; CHECK:       %tmp11 = add i64 %iv.2.2, %iv.2.1
+; CHECK-NEXT:  -->  ({0,+,-1}<nsw><%loop_2> + %iv.2.1)
+; CHECK:       %tmp12 = trunc i64 %tmp11 to i32
+; CHECK-NEXT:  -->  (trunc i64 ({0,+,-1}<nsw><%loop_2> + %iv.2.1) to i32)
+; CHECK:       %tmp14 = mul i32 %tmp12, %tmp7
+; CHECK-NEXT:  -->  ((trunc i64 ({0,+,-1}<nsw><%loop_2> + %iv.2.1) to i32) * {-1,+,-1}<%loop_1>)
+; CHECK:       %tmp16 = mul i64 %iv.2.1, %iv.1.1
+; CHECK-NEXT:  -->  ({2,+,1}<nuw><nsw><%loop_1> * %iv.2.1)
+
+entry:
+  br label %loop_1
+
+loop_1:
+  %iv.1.1 = phi i64 [ 2, %entry ], [ %iv.1.1.next, %loop_1_back_branch ]
+  %iv.1.2 = phi i32 [ -1, %entry ], [ %iv.1.2.next, %loop_1_back_branch ]
+  br label %loop_1_exit
+
+dead:
+  br label %loop_1_exit
+
+loop_1_exit:
+  %tmp5 = icmp sgt i64 %iv.1.1, 2
+  br i1 %tmp5, label %loop_2_preheader, label %loop_1_back_branch
+
+loop_1_back_branch:
+  %iv.1.1.next = add nuw nsw i64 %iv.1.1, 1
+  %iv.1.2.next = add nsw i32 %iv.1.2, 1
+  br label %loop_1
+
+loop_2_preheader:
+  %tmp6 = sub i64 1, %iv.1.1
+  %tmp7 = trunc i64 %tmp6 to i32
+  br label %loop_2
+
+loop_2:
+  %iv.2.1 = phi i64 [ 0, %loop_2_preheader ], [ %tmp16, %loop_2 ]
+  %iv.2.2 = phi i64 [ 0, %loop_2_preheader ], [ %iv.2.2.next, %loop_2 ]
+  %iv.2.3 = phi i64 [ 2, %loop_2_preheader ], [ %iv.2.3.next, %loop_2 ]
+  %tmp11 = add i64 %iv.2.2, %iv.2.1
+  %tmp12 = trunc i64 %tmp11 to i32
+  %tmp14 = mul i32 %tmp12, %tmp7
+  %tmp16 = mul i64 %iv.2.1, %iv.1.1
+  %iv.2.3.next = add nuw nsw i64 %iv.2.3, 1
+  %iv.2.2.next = add nsw i64 %iv.2.2, -1
+  %tmp17 = icmp slt i64 %iv.2.3.next, %iv.1.1
+  br i1 %tmp17, label %loop_2, label %exit
+
+exit:
+  %tmp10 = add i32 %iv.1.2, 3
+  ret void
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
index 71ea9d54f647..0298315a5510 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -43,7 +43,7 @@ define [1 x double] @constant() {
   ; The key problem here is that we may fail to create an MBB referenced by a
   ; PHI. If so, we cannot complete the G_PHI and mustn't try or bad things
   ; happen.
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: cannot select: G_STORE %vreg4, %vreg2; mem:ST4[%addr] GPR:%vreg4,%vreg2 (in function: pending_phis)
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: cannot select: G_STORE %vreg5, %vreg2; mem:ST4[%addr] GPR:%vreg5,%vreg2 (in function: pending_phis)
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for pending_phis
 ; FALLBACK-WITH-REPORT-OUT-LABEL: pending_phis:
 define i32 @pending_phis(i1 %tst, i32 %val, i32* %addr) {
diff --git a/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll b/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll
index 3ecdb7bbedfb..0972840de47b 100644
--- a/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll
+++ b/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \
 ; RUN:   -O0 -aarch64-enable-global-isel-at-O=0 \
-; RUN:   | FileCheck %s --check-prefix ENABLED --check-prefix NOFALLBACK
+; RUN:   | FileCheck %s --check-prefix ENABLED --check-prefix ENABLED-O0 --check-prefix NOFALLBACK
 
 ; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \
 ; RUN:   -O0 -aarch64-enable-global-isel-at-O=0 -global-isel-abort=2  \
-; RUN:   | FileCheck %s --check-prefix ENABLED --check-prefix FALLBACK
+; RUN:   | FileCheck %s --check-prefix ENABLED --check-prefix ENABLED-O0 --check-prefix FALLBACK
 
 ; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \
 ; RUN:   -global-isel \
@@ -32,6 +32,7 @@
 ; ENABLED:       IRTranslator
 ; ENABLED-NEXT:  Legalizer
 ; ENABLED-NEXT:  RegBankSelect
+; ENABLED-O0-NEXT:  Localizer
 ; ENABLED-NEXT:  InstructionSelect
 ; ENABLED-NEXT:  ResetMachineFunction
 
diff --git a/test/CodeGen/AArch64/GlobalISel/localizer-in-O0-pipeline.mir b/test/CodeGen/AArch64/GlobalISel/localizer-in-O0-pipeline.mir
new file mode 100644
index 000000000000..ea8a77ca3917
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/localizer-in-O0-pipeline.mir
@@ -0,0 +1,96 @@
+# RUN: llc -O0 -mtriple aarch64-apple-ios %s -global-isel -start-after regbankselect \
+# RUN:    -stop-before instruction-select -o - | FileCheck --check-prefix=CHECK --check-prefix=OPTNONE %s
+# RUN: llc -mtriple aarch64-apple-ios %s -global-isel -start-after regbankselect \
+# RUN:   -stop-before instruction-select -o - | FileCheck --check-prefix=CHECK --check-prefix=OPT %s
+#
+# Check that we are only running the localizer at O0 and that it runs
+# between the regbankselect pass and the instruction-select.
+# Moreover, check that it does what we expect.
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-apple-ios"
+  
+  define float @foo(float %arg, i1 %cond) {
+    br i1 %cond, label %true, label %false
+  
+  true:                                             ; preds = %0
+    br label %end
+  
+  false:                                            ; preds = %0
+    br label %end
+  
+  end:                                              ; preds = %false, %true
+    %val = phi float [ 1.000000e+00, %true ], [ 2.000000e+00, %false ]
+    %res = fadd float %arg, %val
+    ret float %res
+  }
+
+...
+---
+# CHECK-LABEL: name: foo
+name:            foo
+alignment:       2
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+registers:
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: fpr }
+# CHECK-NEXT: - { id: 1, class: gpr }
+# CHECK-NEXT: - { id: 2, class: fpr }
+# CHECK-NEXT: - { id: 3, class: fpr }
+# CHECK-NEXT: - { id: 4, class: fpr }
+# CHECK-NEXT: - { id: 5, class: fpr }
+# The localizer will create two new values to materialize the constants.
+# OPTNONE-NEXT:  - { id: 6, class: fpr }
+# OPTNONE-NEXT:  - { id: 7, class: fpr }
+  - { id: 0, class: fpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: fpr }
+  - { id: 3, class: fpr }
+  - { id: 4, class: fpr }
+  - { id: 5, class: fpr }
+
+# First block remains untouched
+# CHECK: body
+# CHECK:   %4(s32) = G_FCONSTANT float 1.000000e+00
+# CHECK:   %5(s32) = G_FCONSTANT float 2.000000e+00
+
+# Second block will get the constant 1.0 when the localizer is enabled.
+# CHECK: bb.1.true:
+# OPT-NOT: G_FCONSTANT
+# OPTNONE: [[FONE:%[0-9]+]](s32) = G_FCONSTANT float 1.000000e+00
+# CHECK: G_BR %bb.3.end
+
+# Thrid block will get the constant 2.0 when the localizer is enabled.
+# CHECK: bb.2.false:
+# OPT-NOT: G_FCONSTANT
+# OPTNONE: [[FTWO:%[0-9]+]](s32) = G_FCONSTANT float 2.000000e+00
+
+# CHECK: bb.3.end
+# OPTNONE: %2(s32) = PHI [[FONE]](s32), %bb.1.true, [[FTWO]](s32), %bb.2.false
+# OPT: %2(s32) = PHI %4(s32), %bb.1.true, %5(s32), %bb.2.false
+# CHECK-NEXT: G_FADD %0, %2
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %s0, %w0
+
+    %0(s32) = COPY %s0
+    %1(s1) = COPY %w0
+    %4(s32) = G_FCONSTANT float 1.000000e+00
+    %5(s32) = G_FCONSTANT float 2.000000e+00
+    G_BRCOND %1(s1), %bb.1.true
+    G_BR %bb.2.false
+  
+  bb.1.true:
+    G_BR %bb.3.end
+  
+  bb.2.false:
+  
+  bb.3.end:
+    %2(s32) = PHI %4(s32), %bb.1.true, %5(s32), %bb.2.false
+    %3(s32) = G_FADD %0, %2
+    %s0 = COPY %3(s32)
+    RET_ReallyLR implicit %s0
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/localizer.mir b/test/CodeGen/AArch64/GlobalISel/localizer.mir
new file mode 100644
index 000000000000..8fbb2040157e
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/localizer.mir
@@ -0,0 +1,312 @@
+# RUN: llc -O0 -mtriple=aarch64-apple-ios -run-pass=localizer -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK
+
+# Test the localizer.
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @local_use() { ret void }
+  define void @non_local_1use() { ret void }
+  define void @non_local_2uses() { ret void }
+  define void @non_local_phi_use() { ret void }
+  define void @non_local_phi_use_followed_by_use() { ret void }
+  define void @non_local_phi_use_followed_by_use_fi() { ret void }
+  define void @float_non_local_phi_use_followed_by_use_fi() { ret void }
+...
+
+---
+# CHECK-LABEL: name: local_use
+name:            local_use
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0(s32) = G_CONSTANT 1
+# CHECK-NEXT: %1(s32) = G_ADD %0, %0
+body:             |
+  bb.0:
+    %0(s32) = G_CONSTANT 1
+    %1(s32) = G_ADD %0, %0
+...
+
+---
+# CHECK-LABEL: name: non_local_1use
+name:            non_local_1use
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# Existing registers should be left untouched
+# CHECK:  - { id: 0, class: gpr }
+#CHECK-NEXT:  - { id: 1, class: gpr }
+#CHECK-NEXT:  - { id: 2, class: gpr }
+# The newly created reg should be on the same regbank/regclass as its origin.
+#CHECK-NEXT:  - { id: 3, class: gpr }
+
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0(s32) = G_CONSTANT 1
+# CHECK-NEXT: %1(s32) = G_ADD %0, %0
+
+# CHECK: bb.1:
+# CHECK: %3(s32) = G_CONSTANT 1
+# CHECK-NEXT: %2(s32) = G_ADD %3, %1
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    %0(s32) = G_CONSTANT 1
+    %1(s32) = G_ADD %0, %0
+
+  bb.1:
+    %2(s32) = G_ADD %0, %1
+...
+
+
+---
+# CHECK-LABEL: name: non_local_2uses
+name:            non_local_2uses
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# Existing registers should be left untouched
+# CHECK:  - { id: 0, class: gpr }
+#CHECK-NEXT:  - { id: 1, class: gpr }
+#CHECK-NEXT:  - { id: 2, class: gpr }
+# The newly created reg should be on the same regbank/regclass as its origin.
+#CHECK-NEXT:  - { id: 3, class: gpr }
+
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0(s32) = G_CONSTANT 1
+# CHECK-NEXT: %1(s32) = G_ADD %0, %0
+
+# CHECK: bb.1:
+# CHECK: %3(s32) = G_CONSTANT 1
+# CHECK-NEXT: %2(s32) = G_ADD %3, %3
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    %0(s32) = G_CONSTANT 1
+    %1(s32) = G_ADD %0, %0
+
+  bb.1:
+    %2(s32) = G_ADD %0, %0
+...
+
+---
+# CHECK-LABEL: name: non_local_phi_use
+name:            non_local_phi_use
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+# CHECK:      registers:
+# Existing registers should be left untouched
+# CHECK:  - { id: 0, class: gpr }
+#CHECK-NEXT:  - { id: 1, class: gpr }
+#CHECK-NEXT:  - { id: 2, class: gpr }
+#CHECK-NEXT:  - { id: 3, class: gpr }
+#CHECK-NEXT:  - { id: 4, class: gpr }
+# The newly created reg should be on the same regbank/regclass as its origin.
+#CHECK-NEXT:  - { id: 5, class: gpr }
+
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+  - { id: 4, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0(s32) = G_CONSTANT 1
+# CHECK-NEXT: %1(s32) = G_ADD %0, %0
+
+# CHECK: bb.1:
+# CHECK: %5(s32) = G_CONSTANT 1
+
+# CHECK: bb.2:
+# CHECK: %3(s32) = PHI %5(s32), %bb.1
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    %0(s32) = G_CONSTANT 1
+    %1(s32) = G_ADD %0, %0
+
+  bb.1:
+    successors: %bb.2
+
+  bb.2:
+    %3(s32) = PHI %0(s32), %bb.1
+    %2(s32) = G_ADD %3, %3
+...
+
+---
+# CHECK-LABEL: name: non_local_phi_use_followed_by_use
+name:            non_local_phi_use_followed_by_use
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+# CHECK:      registers:
+# Existing registers should be left untouched
+# CHECK:  - { id: 0, class: gpr }
+#CHECK-NEXT:  - { id: 1, class: gpr }
+#CHECK-NEXT:  - { id: 2, class: gpr }
+#CHECK-NEXT:  - { id: 3, class: gpr }
+#CHECK-NEXT:  - { id: 4, class: gpr }
+# The newly created regs should be on the same regbank/regclass as its origin.
+#CHECK-NEXT:  - { id: 5, class: gpr }
+#CHECK-NEXT:  - { id: 6, class: gpr }
+
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+  - { id: 4, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0(s32) = G_CONSTANT 1
+# CHECK-NEXT: %1(s32) = G_ADD %0, %0
+
+# CHECK: bb.1:
+# CHECK: %5(s32) = G_CONSTANT 1
+
+# CHECK: bb.2:
+# CHECK: %3(s32) = PHI %5(s32), %bb.1
+# CHECK-NEXT: %6(s32) = G_CONSTANT 1
+# CHECK-NEXT: %2(s32) = G_ADD %3, %6
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    %0(s32) = G_CONSTANT 1
+    %1(s32) = G_ADD %0, %0
+
+  bb.1:
+    successors: %bb.2
+
+  bb.2:
+    %3(s32) = PHI %0(s32), %bb.1
+    %2(s32) = G_ADD %3, %0
+...
+
+---
+# CHECK-LABEL: name: non_local_phi_use_followed_by_use_fi
+name:            non_local_phi_use_followed_by_use_fi
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+# CHECK:      registers:
+# Existing registers should be left untouched
+# CHECK:  - { id: 0, class: gpr }
+#CHECK-NEXT:  - { id: 1, class: gpr }
+#CHECK-NEXT:  - { id: 2, class: gpr }
+#CHECK-NEXT:  - { id: 3, class: gpr }
+#CHECK-NEXT:  - { id: 4, class: gpr }
+# The newly created reg should be on the same regbank/regclass as its origin.
+#CHECK-NEXT:  - { id: 5, class: gpr }
+#CHECK-NEXT:  - { id: 6, class: gpr }
+
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+  - { id: 4, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0(s32) = G_FRAME_INDEX 1
+# CHECK-NEXT: %1(s32) = G_ADD %0, %0
+
+# CHECK: bb.1:
+# CHECK: %5(s32) = G_FRAME_INDEX 1
+
+# CHECK: bb.2:
+# CHECK: %3(s32) = PHI %5(s32), %bb.1
+# CHECK-NEXT: %6(s32) = G_FRAME_INDEX 1
+# CHECK-NEXT: %2(s32) = G_ADD %3, %6
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    %0(s32) = G_FRAME_INDEX 1
+    %1(s32) = G_ADD %0, %0
+
+  bb.1:
+    successors: %bb.2
+
+  bb.2:
+    %3(s32) = PHI %0(s32), %bb.1
+    %2(s32) = G_ADD %3, %0
+...
+
+---
+# CHECK-LABEL: name: float_non_local_phi_use_followed_by_use_fi
+name:            float_non_local_phi_use_followed_by_use_fi
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+# CHECK:      registers:
+# Existing registers should be left untouched
+# CHECK:  - { id: 0, class: fpr }
+#CHECK-NEXT:  - { id: 1, class: fpr }
+#CHECK-NEXT:  - { id: 2, class: fpr }
+#CHECK-NEXT:  - { id: 3, class: fpr }
+#CHECK-NEXT:  - { id: 4, class: fpr }
+# The newly created reg should be on the same regbank/regclass as its origin.
+#CHECK-NEXT:  - { id: 5, class: fpr }
+#CHECK-NEXT:  - { id: 6, class: fpr }
+
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+  - { id: 3, class: fpr }
+  - { id: 4, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0(s32) = G_FCONSTANT float 1.0
+# CHECK-NEXT: %1(s32) = G_FADD %0, %0
+
+# CHECK: bb.1:
+# CHECK: %5(s32) = G_FCONSTANT float 1.0
+
+# CHECK: bb.2:
+# CHECK: %3(s32) = PHI %5(s32), %bb.1
+# CHECK-NEXT: %6(s32) = G_FCONSTANT float 1.0
+# CHECK-NEXT: %2(s32) = G_FADD %3, %6
+body:             |
+  bb.0:
+    successors: %bb.1
+
+    %0(s32) = G_FCONSTANT float 1.0
+    %1(s32) = G_FADD %0, %0
+
+  bb.1:
+    successors: %bb.2
+
+  bb.2:
+    %3(s32) = PHI %0(s32), %bb.1
+    %2(s32) = G_FADD %3, %0
+...
diff --git a/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/test/CodeGen/AArch64/aarch64-stp-cluster.ll
index fe5abbf15eff..25cf313b81e7 100644
--- a/test/CodeGen/AArch64/aarch64-stp-cluster.ll
+++ b/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s
 
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: stp_i64_scale:BB#0
diff --git a/test/CodeGen/AArch64/arm64-csldst-mmo.ll b/test/CodeGen/AArch64/arm64-csldst-mmo.ll
index 4930c493d62c..cfb8e3a38c49 100644
--- a/test/CodeGen/AArch64/arm64-csldst-mmo.ll
+++ b/test/CodeGen/AArch64/arm64-csldst-mmo.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
 ; REQUIRES: asserts
 
 @G = external global [0 x i32], align 4
diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll
index f28d0ab07c5a..f849df2a51ec 100644
--- a/test/CodeGen/AArch64/arm64-inline-asm.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm.ll
@@ -254,3 +254,10 @@ define void @test_constraint_w(i32 %a) {
   tail call void asm sideeffect "sqxtn h0, ${0:s}\0A", "w"(i32 %a)
   ret void
 }
+
+define void @test_inline_modifier_a(i8* %ptr) nounwind {
+  ; CHECK-LABEL: test_inline_modifier_a:
+  tail call void asm sideeffect "prfm pldl1keep, ${0:a}\0A", "r"(i8* %ptr)
+  ; CHECK: prfm pldl1keep, [x0]
+  ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/test/CodeGen/AArch64/arm64-ldp-cluster.ll
index 0cfbe5958f4d..64e535ca7499 100644
--- a/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1 -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck --check-prefix=EXYNOS %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck --check-prefix=EXYNOS %s
 
 ; Test ldr clustering.
 ; CHECK: ********** MI Scheduling **********
diff --git a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
index 41287a17da86..307d1ec1aa8c 100644
--- a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
+++ b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -disable-machine-dce -o - 2>&1 > /dev/null | FileCheck %s
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -disable-machine-dce -o - -misched-limit=2 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -disable-machine-dce -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -disable-machine-dce -o - -misched-limit=2 2>&1 > /dev/null | FileCheck %s
 ;
 ; The Cortex-A53 machine model will cause the MADD instruction to be scheduled
 ; much higher than the ADD instructions in order to hide latency. When not
diff --git a/test/CodeGen/AArch64/arm64-misched-basic-A57.ll b/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
index fac5f8ad2e9f..82ba18ce72ca 100644
--- a/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
+++ b/test/CodeGen/AArch64/arm64-misched-basic-A57.ll
@@ -6,7 +6,7 @@
 ; the loads to avoid unnecessary stalls. The generic machine model schedules 4
 ; loads consecutively for this case and will cause stalls.
 ;
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
 ; CHECK: ********** MI Scheduling **********
 ; CHECK: main:BB#2
 ; CHECK: LDR
diff --git a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
index 0ee74d1f782e..cde62fcb3f95 100644
--- a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
+++ b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
 ;
 ; For Cortex-A53, shiftable operands that are not actually shifted
 ; are not needed for an additional two cycles.
diff --git a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
index 0ec754f97ec7..748a4762d82f 100644
--- a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
+++ b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
 ;
 ; Test for bug in misched memory dependency calculation.
 ;
diff --git a/test/CodeGen/AArch64/arm64-misched-multimmo.ll b/test/CodeGen/AArch64/arm64-misched-multimmo.ll
index 3593668e0156..75f45da0e48f 100644
--- a/test/CodeGen/AArch64/arm64-misched-multimmo.ll
+++ b/test/CodeGen/AArch64/arm64-misched-multimmo.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
 
 
 @G1 = common global [100 x i32] zeroinitializer, align 4
diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll
index ff7a0a8300e2..6b754b0a169e 100644
--- a/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/test/CodeGen/AArch64/arm64-vabs.ll
@@ -33,7 +33,7 @@ define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: sabdl2_8h:
-;CHECK: sabdl2.8h
+;CHECK: sabdl.8h
         %load1 = load <16 x i8>, <16 x i8>* %A
         %load2 = load <16 x i8>, <16 x i8>* %B
         %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -45,7 +45,7 @@ define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: sabdl2_4s:
-;CHECK: sabdl2.4s
+;CHECK: sabdl.4s
         %load1 = load <8 x i16>, <8 x i16>* %A
         %load2 = load <8 x i16>, <8 x i16>* %B
         %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -57,7 +57,7 @@ define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: sabdl2_2d:
-;CHECK: sabdl2.2d
+;CHECK: sabdl.2d
         %load1 = load <4 x i32>, <4 x i32>* %A
         %load2 = load <4 x i32>, <4 x i32>* %B
         %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -99,7 +99,7 @@ define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: uabdl2_8h:
-;CHECK: uabdl2.8h
+;CHECK: uabdl.8h
   %load1 = load <16 x i8>, <16 x i8>* %A
   %load2 = load <16 x i8>, <16 x i8>* %B
   %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -112,7 +112,7 @@ define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: uabdl2_4s:
-;CHECK: uabdl2.4s
+;CHECK: uabdl.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -124,7 +124,7 @@ define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: uabdl2_2d:
-;CHECK: uabdl2.2d
+;CHECK: uabdl.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -561,7 +561,7 @@ define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind
 
 define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
 ;CHECK-LABEL: sabal2_8h:
-;CHECK: sabal2.8h
+;CHECK: sabal.8h
         %load1 = load <16 x i8>, <16 x i8>* %A
         %load2 = load <16 x i8>, <16 x i8>* %B
         %tmp3 = load <8 x i16>, <8 x i16>* %C
@@ -575,7 +575,7 @@ define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwin
 
 define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sabal2_4s:
-;CHECK: sabal2.4s
+;CHECK: sabal.4s
         %load1 = load <8 x i16>, <8 x i16>* %A
         %load2 = load <8 x i16>, <8 x i16>* %B
         %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -589,7 +589,7 @@ define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwin
 
 define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sabal2_2d:
-;CHECK: sabal2.2d
+;CHECK: sabal.2d
         %load1 = load <4 x i32>, <4 x i32>* %A
         %load2 = load <4 x i32>, <4 x i32>* %B
         %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -639,7 +639,7 @@ define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind
 
 define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind {
 ;CHECK-LABEL: uabal2_8h:
-;CHECK: uabal2.8h
+;CHECK: uabal.8h
         %load1 = load <16 x i8>, <16 x i8>* %A
         %load2 = load <16 x i8>, <16 x i8>* %B
         %tmp3 = load <8 x i16>, <8 x i16>* %C
@@ -653,7 +653,7 @@ define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwin
 
 define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: uabal2_4s:
-;CHECK: uabal2.4s
+;CHECK: uabal.4s
         %load1 = load <8 x i16>, <8 x i16>* %A
         %load2 = load <8 x i16>, <8 x i16>* %B
         %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -667,7 +667,7 @@ define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwin
 
 define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: uabal2_2d:
-;CHECK: uabal2.2d
+;CHECK: uabal.2d
         %load1 = load <4 x i32>, <4 x i32>* %A
         %load2 = load <4 x i32>, <4 x i32>* %B
         %tmp3 = load <2 x i64>, <2 x i64>* %C
diff --git a/test/CodeGen/AArch64/arm64-vadd.ll b/test/CodeGen/AArch64/arm64-vadd.ll
index 9d09251524ea..2a25538250e4 100644
--- a/test/CodeGen/AArch64/arm64-vadd.ll
+++ b/test/CodeGen/AArch64/arm64-vadd.ll
@@ -318,7 +318,7 @@ define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: uaddw2_8h:
-;CHECK: uaddw2.8h
+;CHECK: uaddw.8h
         %tmp1 = load <8 x i16>, <8 x i16>* %A
 
         %tmp2 = load <16 x i8>, <16 x i8>* %B
@@ -331,7 +331,7 @@ define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: uaddw2_4s:
-;CHECK: uaddw2.4s
+;CHECK: uaddw.4s
         %tmp1 = load <4 x i32>, <4 x i32>* %A
 
         %tmp2 = load <8 x i16>, <8 x i16>* %B
@@ -344,7 +344,7 @@ define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: uaddw2_2d:
-;CHECK: uaddw2.2d
+;CHECK: uaddw.2d
         %tmp1 = load <2 x i64>, <2 x i64>* %A
 
         %tmp2 = load <4 x i32>, <4 x i32>* %B
@@ -387,7 +387,7 @@ define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: saddw2_8h:
-;CHECK: saddw2.8h
+;CHECK: saddw.8h
         %tmp1 = load <8 x i16>, <8 x i16>* %A
 
         %tmp2 = load <16 x i8>, <16 x i8>* %B
@@ -400,7 +400,7 @@ define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: saddw2_4s:
-;CHECK: saddw2.4s
+;CHECK: saddw.4s
         %tmp1 = load <4 x i32>, <4 x i32>* %A
 
         %tmp2 = load <8 x i16>, <8 x i16>* %B
@@ -413,7 +413,7 @@ define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: saddw2_2d:
-;CHECK: saddw2.2d
+;CHECK: saddw.2d
         %tmp1 = load <2 x i64>, <2 x i64>* %A
 
         %tmp2 = load <4 x i32>, <4 x i32>* %B
diff --git a/test/CodeGen/AArch64/arm64-vmul.ll b/test/CodeGen/AArch64/arm64-vmul.ll
index a7668ec97979..f70ed9a43427 100644
--- a/test/CodeGen/AArch64/arm64-vmul.ll
+++ b/test/CodeGen/AArch64/arm64-vmul.ll
@@ -83,7 +83,7 @@ define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 
 define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: sqdmull2_4s:
-;CHECK: sqdmull2.4s
+;CHECK: sqdmull.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -94,7 +94,7 @@ define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: sqdmull2_2d:
-;CHECK: sqdmull2.2d
+;CHECK: sqdmull.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -324,7 +324,7 @@ define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin
 
 define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sqdmlal2_4s:
-;CHECK: sqdmlal2.4s
+;CHECK: sqdmlal.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -337,7 +337,7 @@ define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw
 
 define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sqdmlal2_2d:
-;CHECK: sqdmlal2.2d
+;CHECK: sqdmlal.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -372,7 +372,7 @@ define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin
 
 define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sqdmlsl2_4s:
-;CHECK: sqdmlsl2.4s
+;CHECK: sqdmlsl.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -385,7 +385,7 @@ define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw
 
 define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sqdmlsl2_2d:
-;CHECK: sqdmlsl2.2d
+;CHECK: sqdmlsl.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -874,7 +874,7 @@ define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: sqdmull2_lane_4s:
 ;CHECK-NOT: dup
-;CHECK: sqdmull2.4s
+;CHECK: sqdmull.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -886,7 +886,7 @@ define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: sqdmull2_lane_2d:
 ;CHECK-NOT: dup
-;CHECK: sqdmull2.2d
+;CHECK: sqdmull.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -994,7 +994,7 @@ define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n
 define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sqdmlal2_lane_4s:
 ;CHECK-NOT: dup
-;CHECK: sqdmlal2.4s
+;CHECK: sqdmlal.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -1008,7 +1008,7 @@ define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C)
 define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sqdmlal2_lane_2d:
 ;CHECK-NOT: dup
-;CHECK: sqdmlal2.2d
+;CHECK: sqdmlal.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp3 = load <2 x i64>, <2 x i64>* %C
@@ -1147,7 +1147,7 @@ define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n
 define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sqdmlsl2_lane_4s:
 ;CHECK-NOT: dup
-;CHECK: sqdmlsl2.4s
+;CHECK: sqdmlsl.4s
   %load1 = load <8 x i16>, <8 x i16>* %A
   %load2 = load <8 x i16>, <8 x i16>* %B
   %tmp3 = load <4 x i32>, <4 x i32>* %C
@@ -1161,7 +1161,7 @@ define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C)
 define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind {
 ;CHECK-LABEL: sqdmlsl2_lane_2d:
 ;CHECK-NOT: dup
-;CHECK: sqdmlsl2.2d
+;CHECK: sqdmlsl.2d
   %load1 = load <4 x i32>, <4 x i32>* %A
   %load2 = load <4 x i32>, <4 x i32>* %B
   %tmp3 = load <2 x i64>, <2 x i64>* %C
diff --git a/test/CodeGen/AArch64/arm64-vshift.ll b/test/CodeGen/AArch64/arm64-vshift.ll
index c1c4649bd6a4..6b0fe40b5a09 100644
--- a/test/CodeGen/AArch64/arm64-vshift.ll
+++ b/test/CodeGen/AArch64/arm64-vshift.ll
@@ -1164,7 +1164,7 @@ define <2 x i64> @ushll2d(<2 x i32>* %A) nounwind {
 
 define <8 x i16> @ushll2_8h(<16 x i8>* %A) nounwind {
 ;CHECK-LABEL: ushll2_8h:
-;CHECK: ushll2.8h v0, {{v[0-9]+}}, #1
+;CHECK: ushll.8h v0, {{v[0-9]+}}, #1
         %load1 = load <16 x i8>, <16 x i8>* %A
         %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
@@ -1174,7 +1174,7 @@ define <8 x i16> @ushll2_8h(<16 x i8>* %A) nounwind {
 
 define <4 x i32> @ushll2_4s(<8 x i16>* %A) nounwind {
 ;CHECK-LABEL: ushll2_4s:
-;CHECK: ushll2.4s v0, {{v[0-9]+}}, #1
+;CHECK: ushll.4s v0, {{v[0-9]+}}, #1
         %load1 = load <8 x i16>, <8 x i16>* %A
         %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
         %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
@@ -1184,7 +1184,7 @@ define <4 x i32> @ushll2_4s(<8 x i16>* %A) nounwind {
 
 define <2 x i64> @ushll2_2d(<4 x i32>* %A) nounwind {
 ;CHECK-LABEL: ushll2_2d:
-;CHECK: ushll2.2d v0, {{v[0-9]+}}, #1
+;CHECK: ushll.2d v0, {{v[0-9]+}}, #1
         %load1 = load <4 x i32>, <4 x i32>* %A
         %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
         %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
@@ -1221,7 +1221,7 @@ define <2 x i64> @sshll2d(<2 x i32>* %A) nounwind {
 
 define <8 x i16> @sshll2_8h(<16 x i8>* %A) nounwind {
 ;CHECK-LABEL: sshll2_8h:
-;CHECK: sshll2.8h v0, {{v[0-9]+}}, #1
+;CHECK: sshll.8h v0, {{v[0-9]+}}, #1
         %load1 = load <16 x i8>, <16 x i8>* %A
         %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
@@ -1231,7 +1231,7 @@ define <8 x i16> @sshll2_8h(<16 x i8>* %A) nounwind {
 
 define <4 x i32> @sshll2_4s(<8 x i16>* %A) nounwind {
 ;CHECK-LABEL: sshll2_4s:
-;CHECK: sshll2.4s v0, {{v[0-9]+}}, #1
+;CHECK: sshll.4s v0, {{v[0-9]+}}, #1
         %load1 = load <8 x i16>, <8 x i16>* %A
         %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
         %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
@@ -1241,7 +1241,7 @@ define <4 x i32> @sshll2_4s(<8 x i16>* %A) nounwind {
 
 define <2 x i64> @sshll2_2d(<4 x i32>* %A) nounwind {
 ;CHECK-LABEL: sshll2_2d:
-;CHECK: sshll2.2d v0, {{v[0-9]+}}, #1
+;CHECK: sshll.2d v0, {{v[0-9]+}}, #1
         %load1 = load <4 x i32>, <4 x i32>* %A
         %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
         %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
diff --git a/test/CodeGen/AArch64/arm64-vsub.ll b/test/CodeGen/AArch64/arm64-vsub.ll
index 7af69118347e..6746e49989cb 100644
--- a/test/CodeGen/AArch64/arm64-vsub.ll
+++ b/test/CodeGen/AArch64/arm64-vsub.ll
@@ -157,7 +157,7 @@ define <2 x i64> @ssubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: ssubl2_8h:
-;CHECK: ssubl2.8h
+;CHECK: ssubl.8h
         %tmp1 = load <16 x i8>, <16 x i8>* %A
         %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         %ext1 = sext <8 x i8> %high1 to <8 x i16>
@@ -172,7 +172,7 @@ define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: ssubl2_4s:
-;CHECK: ssubl2.4s
+;CHECK: ssubl.4s
         %tmp1 = load <8 x i16>, <8 x i16>* %A
         %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
         %ext1 = sext <4 x i16> %high1 to <4 x i32>
@@ -187,7 +187,7 @@ define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @ssubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: ssubl2_2d:
-;CHECK: ssubl2.2d
+;CHECK: ssubl.2d
         %tmp1 = load <4 x i32>, <4 x i32>* %A
         %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
         %ext1 = sext <2 x i32> %high1 to <2 x i64>
@@ -235,7 +235,7 @@ define <2 x i64> @usubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: usubl2_8h:
-;CHECK: usubl2.8h
+;CHECK: usubl.8h
   %tmp1 = load <16 x i8>, <16 x i8>* %A
   %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %ext1 = zext <8 x i8> %high1 to <8 x i16>
@@ -250,7 +250,7 @@ define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: usubl2_4s:
-;CHECK: usubl2.4s
+;CHECK: usubl.4s
   %tmp1 = load <8 x i16>, <8 x i16>* %A
   %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %ext1 = zext <4 x i16> %high1 to <4 x i32>
@@ -265,7 +265,7 @@ define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @usubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: usubl2_2d:
-;CHECK: usubl2.2d
+;CHECK: usubl.2d
   %tmp1 = load <4 x i32>, <4 x i32>* %A
   %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %ext1 = zext <2 x i32> %high1 to <2 x i64>
@@ -310,7 +310,7 @@ define <2 x i64> @ssubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: ssubw2_8h:
-;CHECK: ssubw2.8h
+;CHECK: ssubw.8h
         %tmp1 = load <8 x i16>, <8 x i16>* %A
 
         %tmp2 = load <16 x i8>, <16 x i8>* %B
@@ -323,7 +323,7 @@ define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: ssubw2_4s:
-;CHECK: ssubw2.4s
+;CHECK: ssubw.4s
         %tmp1 = load <4 x i32>, <4 x i32>* %A
 
         %tmp2 = load <8 x i16>, <8 x i16>* %B
@@ -336,7 +336,7 @@ define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @ssubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: ssubw2_2d:
-;CHECK: ssubw2.2d
+;CHECK: ssubw.2d
         %tmp1 = load <2 x i64>, <2 x i64>* %A
 
         %tmp2 = load <4 x i32>, <4 x i32>* %B
@@ -379,7 +379,7 @@ define <2 x i64> @usubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind {
 
 define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 ;CHECK-LABEL: usubw2_8h:
-;CHECK: usubw2.8h
+;CHECK: usubw.8h
         %tmp1 = load <8 x i16>, <8 x i16>* %A
 
         %tmp2 = load <16 x i8>, <16 x i8>* %B
@@ -392,7 +392,7 @@ define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind {
 
 define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: usubw2_4s:
-;CHECK: usubw2.4s
+;CHECK: usubw.4s
         %tmp1 = load <4 x i32>, <4 x i32>* %A
 
         %tmp2 = load <8 x i16>, <8 x i16>* %B
@@ -405,7 +405,7 @@ define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind {
 
 define <2 x i64> @usubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind {
 ;CHECK-LABEL: usubw2_2d:
-;CHECK: usubw2.2d
+;CHECK: usubw.2d
         %tmp1 = load <2 x i64>, <2 x i64>* %A
 
         %tmp2 = load <4 x i32>, <4 x i32>* %B
diff --git a/test/CodeGen/AArch64/asm-print-comments.ll b/test/CodeGen/AArch64/asm-print-comments.ll
new file mode 100644
index 000000000000..e997dce23583
--- /dev/null
+++ b/test/CodeGen/AArch64/asm-print-comments.ll
@@ -0,0 +1,17 @@
+; RUN: llc %s -mtriple=arm64-apple-darwin -o - | FileCheck %s
+
+; CHECK-LABEL: ; -- Begin function foo
+; CHECK: foo:
+define hidden i32 @foo() {
+  entry:
+  ret i32 30
+}
+; CHECK: ; -- End function
+
+; CHECK-LABEL: ; -- Begin function bar
+; CHECK: bar:
+define i32 @bar() {
+  entry:
+  ret i32 30
+}
+; CHECK: ; -- End function
diff --git a/test/CodeGen/AArch64/cmpxchg-O0.ll b/test/CodeGen/AArch64/cmpxchg-O0.ll
index 8432b15ea523..1bfbcf851c0e 100644
--- a/test/CodeGen/AArch64/cmpxchg-O0.ll
+++ b/test/CodeGen/AArch64/cmpxchg-O0.ll
@@ -3,10 +3,11 @@
 define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind {
 ; CHECK-LABEL: test_cmpxchg_8:
 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK:     mov [[STATUS:w[3-9]+]], #0
 ; CHECK:     ldaxrb [[OLD:w[0-9]+]], [x0]
 ; CHECK:     cmp [[OLD]], w1, uxtb
 ; CHECK:     b.ne [[DONE:.LBB[0-9]+_[0-9]+]]
-; CHECK:     stlxrb [[STATUS:w[3-9]]], w2, [x0]
+; CHECK:     stlxrb [[STATUS]], w2, [x0]
 ; CHECK:     cbnz [[STATUS]], [[RETRY]]
 ; CHECK: [[DONE]]:
 ; CHECK:     subs {{w[0-9]+}}, [[OLD]], w1
@@ -18,6 +19,7 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind {
 define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind {
 ; CHECK-LABEL: test_cmpxchg_16:
 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK:     mov [[STATUS:w[3-9]+]], #0
 ; CHECK:     ldaxrh [[OLD:w[0-9]+]], [x0]
 ; CHECK:     cmp [[OLD]], w1, uxth
 ; CHECK:     b.ne [[DONE:.LBB[0-9]+_[0-9]+]]
@@ -33,10 +35,11 @@ define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind
 define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind {
 ; CHECK-LABEL: test_cmpxchg_32:
 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK:     mov [[STATUS:w[3-9]+]], #0
 ; CHECK:     ldaxr [[OLD:w[0-9]+]], [x0]
 ; CHECK:     cmp [[OLD]], w1
 ; CHECK:     b.ne [[DONE:.LBB[0-9]+_[0-9]+]]
-; CHECK:     stlxr [[STATUS:w[3-9]]], w2, [x0]
+; CHECK:     stlxr [[STATUS]], w2, [x0]
 ; CHECK:     cbnz [[STATUS]], [[RETRY]]
 ; CHECK: [[DONE]]:
 ; CHECK:     subs {{w[0-9]+}}, [[OLD]], w1
@@ -48,10 +51,11 @@ define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind
 define { i64, i1 } @test_cmpxchg_64(i64* %addr, i64 %desired, i64 %new) nounwind {
 ; CHECK-LABEL: test_cmpxchg_64:
 ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]:
+; CHECK:     mov [[STATUS:w[3-9]+]], #0
 ; CHECK:     ldaxr [[OLD:x[0-9]+]], [x0]
 ; CHECK:     cmp [[OLD]], x1
 ; CHECK:     b.ne [[DONE:.LBB[0-9]+_[0-9]+]]
-; CHECK:     stlxr [[STATUS:w[3-9]]], x2, [x0]
+; CHECK:     stlxr [[STATUS]], x2, [x0]
 ; CHECK:     cbnz [[STATUS]], [[RETRY]]
 ; CHECK: [[DONE]]:
 ; CHECK:     subs {{x[0-9]+}}, [[OLD]], x1
diff --git a/test/CodeGen/AArch64/fast-isel-cmpxchg.ll b/test/CodeGen/AArch64/fast-isel-cmpxchg.ll
index aa78210fae74..7ef625abab20 100644
--- a/test/CodeGen/AArch64/fast-isel-cmpxchg.ll
+++ b/test/CodeGen/AArch64/fast-isel-cmpxchg.ll
@@ -2,11 +2,12 @@
 
 ; CHECK-LABEL: cmpxchg_monotonic_32:
 ; CHECK: [[RETRY:.LBB[0-9_]+]]:
+; CHECK-NEXT:     mov [[STATUS:w[0-9]+]], #0
 ; CHECK-NEXT:     ldaxr [[OLD:w[0-9]+]], [x0]
 ; CHECK-NEXT:     cmp [[OLD]], w1
 ; CHECK-NEXT:     b.ne [[DONE:.LBB[0-9_]+]]
 ; CHECK-NEXT: // BB#2:
-; CHECK-NEXT:     stlxr [[STATUS:w[0-9]+]], w2, [x0]
+; CHECK-NEXT:     stlxr [[STATUS]], w2, [x0]
 ; CHECK-NEXT:     cbnz [[STATUS]], [[RETRY]]
 ; CHECK-NEXT: [[DONE]]:
 ; CHECK-NEXT:     cmp [[OLD]], w1
@@ -27,11 +28,12 @@ define i32 @cmpxchg_monotonic_32(i32* %p, i32 %cmp, i32 %new, i32* %ps) #0 {
 ; CHECK:      // BB#0:
 ; CHECK:     ldr [[NEW:w[0-9]+]], [x2]
 ; CHECK-NEXT: [[RETRY:.LBB[0-9_]+]]:
+; CHECK-NEXT:     mov [[STATUS:w[0-9]+]], #0
 ; CHECK-NEXT:     ldaxr [[OLD:w[0-9]+]], [x0]
 ; CHECK-NEXT:     cmp [[OLD]], w1
 ; CHECK-NEXT:     b.ne [[DONE:.LBB[0-9_]+]]
 ; CHECK-NEXT: // BB#2:
-; CHECK-NEXT:     stlxr [[STATUS:w[0-9]+]], [[NEW]], [x0]
+; CHECK-NEXT:     stlxr [[STATUS]], [[NEW]], [x0]
 ; CHECK-NEXT:     cbnz [[STATUS]], [[RETRY]]
 ; CHECK-NEXT: [[DONE]]:
 ; CHECK-NEXT:     cmp [[OLD]], w1
@@ -51,11 +53,12 @@ define i32 @cmpxchg_acq_rel_32_load(i32* %p, i32 %cmp, i32* %pnew, i32* %ps) #0
 
 ; CHECK-LABEL: cmpxchg_seq_cst_64:
 ; CHECK: [[RETRY:.LBB[0-9_]+]]:
+; CHECK-NEXT:     mov [[STATUS:w[0-9]+]], #0
 ; CHECK-NEXT:     ldaxr [[OLD:x[0-9]+]], [x0]
 ; CHECK-NEXT:     cmp [[OLD]], x1
 ; CHECK-NEXT:     b.ne [[DONE:.LBB[0-9_]+]]
 ; CHECK-NEXT: // BB#2:
-; CHECK-NEXT:     stlxr [[STATUS:w[0-9]+]], x2, [x0]
+; CHECK-NEXT:     stlxr [[STATUS]], x2, [x0]
 ; CHECK-NEXT:     cbnz [[STATUS]], [[RETRY]]
 ; CHECK-NEXT: [[DONE]]:
 ; CHECK-NEXT:     cmp [[OLD]], x1
diff --git a/test/CodeGen/AArch64/live-interval-analysis.mir b/test/CodeGen/AArch64/live-interval-analysis.mir
index d44300973566..93dfcf507fff 100644
--- a/test/CodeGen/AArch64/live-interval-analysis.mir
+++ b/test/CodeGen/AArch64/live-interval-analysis.mir
@@ -6,7 +6,7 @@
 ---
 # CHECK-LABEL: ********** INTERVALS **********
 # W29 is reserved, so we should only see dead defs
-# CHECK-DAG: W29 [0B,0d:{{[0-9]+}})[32r,32d:{{[0-9]+}})[64r,64d:{{[0-9]+}})
+# CHECK-DAG: W29 [32r,32d:{{[0-9]+}})[64r,64d:{{[0-9]+}})
 # For normal registers like x28 we should see the full intervals
 # CHECK-DAG: W28 [0B,16r:{{[0-9]+}})[32r,48r:{{[0-9]+}})[48r,48d:{{[0-9]+}})
 # CHECK: # End machine code for function reserved_reg_liveness.
@@ -14,7 +14,7 @@ name: reserved_reg_liveness
 tracksRegLiveness: true
 body: |
   bb.0:
-    liveins: %x28_fp
+    liveins: %x28
     %6 : xseqpairsclass = COPY %x28_fp
     %x28_fp = COPY %6
     %x28 = COPY %x28
diff --git a/test/CodeGen/AArch64/misched-fusion-aes.ll b/test/CodeGen/AArch64/misched-fusion-aes.ll
index 4c682e594e66..1d8787212579 100644
--- a/test/CodeGen/AArch64/misched-fusion-aes.ll
+++ b/test/CodeGen/AArch64/misched-fusion-aes.ll
@@ -1,5 +1,5 @@
-; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57
-; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA72
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57A72
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57A72
 ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1  | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1
 
 declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k)
@@ -72,55 +72,40 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
   ret void
 
 ; CHECK-LABEL: aesea:
-; CHECKA57: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
-; CHECKA57: aesmc {{v[0-7].16b}}, [[VA]]
-; CHECKA57: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
-; CHECKA57: aesmc {{v[0-7].16b}}, [[VB]]
-; CHECKA57: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
-; CHECKA57: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
-; CHECKA57: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
-; CHECKA57: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
-; CHECKA72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
-; CHECKA72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
-; CHECKA72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
-; CHECKA72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
-; CHECKA72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
-; CHECKA72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
-; CHECKA72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
-; CHECKA72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
+; CHECKA57A72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
+; CHECKA57A72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
+; CHECKA57A72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
+; CHECKA57A72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
+; CHECKA57A72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
+; CHECKA57A72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
+; CHECKA57A72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
+; CHECKA57A72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
+
 ; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKM1: aesmc {{v[0-7].16b}}, [[VA]]
+; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
+; CHECKM1: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
 ; CHECKM1: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
 ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
 ; CHECKM1: aese {{v[0-7].16b}}, {{v[0-7].16b}}
 ; CHECKM1: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
 ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
 ; CHECKM1: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKM1: aesmc {{v[0-7].16b}}, [[VD]]
+; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
+; CHECKM1: aesmc {{v[0-7].16b}}, [[VH]]
 ; CHECKM1: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
 ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
 ; CHECKM1: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
 ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
 ; CHECKM1: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
 ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
-; CHECKM1: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
 }
 
 define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) {
@@ -188,53 +173,65 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d,
   ret void
 
 ; CHECK-LABEL: aesda:
-; CHECKA57: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
-; CHECKA57: aesimc {{v[0-7].16b}}, [[VA]]
-; CHECKA57: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
-; CHECKA57: aesimc {{v[0-7].16b}}, [[VB]]
-; CHECKA57: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
-; CHECKA57: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
-; CHECKA57: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
-; CHECKA57: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
-; CHECKA72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
-; CHECKA72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
-; CHECKA72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
-; CHECKA72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
-; CHECKA72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
-; CHECKA72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
-; CHECKA72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
-; CHECKA72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
+; CHECKA57A72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
+; CHECKA57A72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
+; CHECKA57A72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
+; CHECKA57A72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
+; CHECKA57A72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
+; CHECKA57A72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
+; CHECKA57A72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
+; CHECKA57A72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
+
 ; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKM1: aesimc {{v[0-7].16b}}, [[VA]]
+; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VA]]
+; CHECKM1: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
 ; CHECKM1: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
 ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
 ; CHECKM1: aesd {{v[0-7].16b}}, {{v[0-7].16b}}
 ; CHECKM1: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
 ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
 ; CHECKM1: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKM1: aesimc {{v[0-7].16b}}, [[VD]]
+; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
+; CHECKM1: aesimc {{v[0-7].16b}}, [[VH]]
 ; CHECKM1: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
 ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
 ; CHECKM1: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
 ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
 ; CHECKM1: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
 ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
-; CHECKM1: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
-; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
+}
+
+define void @aes_load_store(<16 x i8> *%p1, <16 x i8> *%p2 , <16 x i8> *%p3) {
+entry:
+  %x1 = alloca <16 x i8>, align 16
+  %x2 = alloca <16 x i8>, align 16
+  %x3 = alloca <16 x i8>, align 16
+  %x4 = alloca <16 x i8>, align 16
+  %x5 = alloca <16 x i8>, align 16
+  %in1 = load <16 x i8>, <16 x i8>* %p1, align 16
+  store <16 x i8> %in1, <16 x i8>* %x1, align 16
+  %aese1 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %in1, <16 x i8> %in1) #2
+  store <16 x i8> %aese1, <16 x i8>* %x2, align 16
+  %in2 = load <16 x i8>, <16 x i8>* %p2, align 16
+  %aesmc1= call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %aese1) #2
+  store <16 x i8> %aesmc1, <16 x i8>* %x3, align 16
+  %aese2 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %in1, <16 x i8> %in2) #2
+  store <16 x i8> %aese2, <16 x i8>* %x4, align 16
+  %aesmc2= call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %aese2) #2
+  store <16 x i8> %aesmc2, <16 x i8>* %x5, align 16
+  ret void
+
+; CHECK-LABEL: aes_load_store:
+; CHECK: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
+; CHECK-NEXT: aesmc {{v[0-7].16b}}, [[VA]]
+; CHECK: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
+; CHECK-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
 }
diff --git a/test/CodeGen/AArch64/optimize-imm.ll b/test/CodeGen/AArch64/optimize-imm.ll
index a4725c65aa26..f960a3a95fc9 100644
--- a/test/CodeGen/AArch64/optimize-imm.ll
+++ b/test/CodeGen/AArch64/optimize-imm.ll
@@ -62,3 +62,22 @@ entry:
   %and = xor i32 %xor, 56
   ret i32 %and
 }
+
+; Check that, when (and %t1, 129) is transformed to (and %t0, 0),
+; (xor %arg, 129) doesn't get transformed to (xor %arg, 0).
+;
+; CHECK-LABEL: PR33100:
+; CHECK: mov w[[R0:[0-9]+]], #129
+; CHECK: eor {{x[0-9]+}}, {{x[0-9]+}}, x[[R0]]
+
+define i64 @PR33100(i64 %arg) {
+entry:
+  %alloca0 = alloca i64
+  store i64 8, i64* %alloca0, align 4
+  %t0 = load i64, i64* %alloca0, align 4
+  %t1 = shl i64 %arg, %t0
+  %and0 = and i64 %t1, 129
+  %xor0 = xor i64 %arg, 129
+  %t2 = add i64 %and0, %xor0
+  ret i64 %t2
+}
diff --git a/test/CodeGen/AArch64/scheduledag-constreg.mir b/test/CodeGen/AArch64/scheduledag-constreg.mir
index 23c785504f01..6b83dc715e0a 100644
--- a/test/CodeGen/AArch64/scheduledag-constreg.mir
+++ b/test/CodeGen/AArch64/scheduledag-constreg.mir
@@ -1,4 +1,4 @@
-# RUN: llc -o /dev/null %s -mtriple=aarch64-- -run-pass=machine-scheduler -enable-misched -debug-only=misched 2>&1 | FileCheck %s
+# RUN: llc -o /dev/null %s -mtriple=aarch64-- -run-pass=machine-scheduler -enable-misched -debug-only=machine-scheduler 2>&1 | FileCheck %s
 # REQUIRES: asserts
 --- |
   define void @func() { ret void }
diff --git a/test/CodeGen/AArch64/tailcall_misched_graph.ll b/test/CodeGen/AArch64/tailcall_misched_graph.ll
index 0e4eb2b5fad9..4fbd8944f032 100644
--- a/test/CodeGen/AArch64/tailcall_misched_graph.ll
+++ b/test/CodeGen/AArch64/tailcall_misched_graph.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=cyclone -debug-only=misched < %s 2>&1 | FileCheck %s
+; RUN: llc -mcpu=cyclone -debug-only=machine-scheduler < %s 2>&1 | FileCheck %s
 
 ; REQUIRES: asserts
 
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir
index 8839ba8e0ab2..0557008ceb4f 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir
@@ -5,6 +5,11 @@
   entry:
     ret void
   }
+
+  define void @test_fconstant() {
+  entry:
+    ret void
+  }
 ...
 
 ---
@@ -18,3 +23,18 @@ body: |
 
     %0(s32) = G_CONSTANT i32 5
 ...
+
+---
+name:            test_fconstant
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body: |
+  bb.0.entry:
+    ; CHECK-LABEL: name: test_fconstant
+    ; CHECK: %0(s32) = G_FCONSTANT  float 1.000000e+00
+    ; CHECK: %1(s32) = G_FCONSTANT  float 7.5
+
+    %0(s32) = G_FCONSTANT float 1.0
+    %1(s32) = G_FCONSTANT float 7.5
+...
diff --git a/test/CodeGen/AMDGPU/bfe-combine.ll b/test/CodeGen/AMDGPU/bfe-combine.ll
new file mode 100644
index 000000000000..791b49f0e143
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bfe-combine.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck --check-prefix=GCN --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN --check-prefix=CI %s
+
+; GCN-LABEL: {{^}}bfe_combine8:
+; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 8, 8
+; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], 2, v[[BFE]]
+; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 6, v{{[0-9]+}}
+; CI: v_and_b32_e32 v[[ADDRLO:[0-9]+]], 0x3fc, v[[SHR]]
+; VI: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]]
+; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]:
+define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %idx = add i32 %x, %id
+  %srl = lshr i32 %idx, 8
+  %and = and i32 %srl, 255
+  %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %and
+  %val = load i32, i32 addrspace(1)* %ptr, align 4
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_combine16:
+; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 16, 16
+; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], {{[^,]+}}, v[[BFE]]
+; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 1, v{{[0-9]+}}
+; CI: v_and_b32_e32 v[[AND:[0-9]+]], 0x7fff8000, v[[SHR]]
+; CI: v_lshl_b64 v{{\[}}[[ADDRLO:[0-9]+]]:{{[^\]+}}], v{{\[}}[[AND]]:{{[^\]+}}], 2
+; VI: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]]
+; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]:
+define amdgpu_kernel void @bfe_combine16(i32 addrspace(1)* nocapture %arg, i32 %x) {
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %idx = add i32 %x, %id
+  %srl = lshr i32 %idx, 1
+  %and = and i32 %srl, 2147450880
+  %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %and
+  %val = load i32, i32 addrspace(1)* %ptr, align 4
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/test/CodeGen/AMDGPU/extload-align.ll b/test/CodeGen/AMDGPU/extload-align.ll
index 4644800421d8..12cf27b918af 100644
--- a/test/CodeGen/AMDGPU/extload-align.ll
+++ b/test/CodeGen/AMDGPU/extload-align.ll
@@ -1,4 +1,4 @@
-; RUN: llc -debug-only=misched -march=amdgcn -verify-machineinstrs %s -o - 2>&1| FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC -check-prefix=DEBUG %s
+; RUN: llc -debug-only=machine-scheduler -march=amdgcn -verify-machineinstrs %s -o - 2>&1| FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC -check-prefix=DEBUG %s
 ; REQUIRES: asserts
 
 ; Verify that the extload generated from %eval has the default
@@ -20,4 +20,4 @@ define amdgpu_kernel void @extload_align(i32* %out, i32 %index) #0 {
   %eval = sext i16 %val to i32
   store i32 %eval, i32* %out
   ret void
-}
-\ No newline at end of file
+}
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
index 6fa26cb38793..9441bf208829 100644
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1,8 +1,8 @@
 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=SI,GCN,MESA-GCN,FUNC
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,HSA-VI,FUNC
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 
 ; FUNC-LABEL: {{^}}i8_arg:
 ; HSA-VI: kernarg_segment_alignment = 4
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll
new file mode 100644
index 000000000000..22e15e216805
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+declare i64 @llvm.amdgcn.s.getpc() #0
+
+; GCN-LABEL: {{^}}test_s_getpc:
+; GCN: s_load_dwordx2
+; GCN-DAG: s_getpc_b64 s{{\[[0-9]+:[0-9]+\]}}
+; GCN: buffer_store_dwordx2
+define amdgpu_kernel void @test_s_getpc(i64 addrspace(1)* %out) #0 {
+  %tmp = call i64 @llvm.amdgcn.s.getpc() #1
+  store volatile i64 %tmp, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+attributes #0 = { nounwind readnone speculatable }
diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll
index 5dd2efdf6382..72fde04ba391 100644
--- a/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}constant_load_i16:
 ; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/load-constant-i8.ll b/test/CodeGen/AMDGPU/load-constant-i8.ll
index 6e56b9f9b6d6..bdfc3caf9d01 100644
--- a/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
 ; FUNC-LABEL: {{^}}constant_load_i8:
diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll
index dcdd1a947cd4..e3415b9c47de 100644
--- a/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
 
 ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented
 
diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll
index 71adf090532f..fc0cbf916b52 100644
--- a/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
 ; FUNC-LABEL: {{^}}global_load_i8:
diff --git a/test/CodeGen/AMDGPU/load-local-i16.ll b/test/CodeGen/AMDGPU/load-local-i16.ll
index bbbb34e8d333..7de3f3b28c6d 100644
--- a/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}local_load_i16:
 ; GCN: ds_read_u16 v{{[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/load-local-i8.ll b/test/CodeGen/AMDGPU/load-local-i8.ll
index 731996ec6c45..16eb366a4b15 100644
--- a/test/CodeGen/AMDGPU/load-local-i8.ll
+++ b/test/CodeGen/AMDGPU/load-local-i8.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
 ; FUNC-LABEL: {{^}}local_load_i8:
diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll
index e85a724c1567..60e43f8fb2a7 100644
--- a/test/CodeGen/AMDGPU/min.ll
+++ b/test/CodeGen/AMDGPU/min.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
 ; GCN: v_min_i32_e32
diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
index a90f200f79e3..190d2b72ebaf 100644
--- a/test/CodeGen/AMDGPU/parallelandifcollapse.ll
+++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck %s
 ;
 ; CFG flattening should use parallel-and mode to generate branch conditions and
 ; then merge if-regions with the same bodies.
diff --git a/test/CodeGen/AMDGPU/r600.bitcast.ll b/test/CodeGen/AMDGPU/r600.bitcast.ll
index acf7a66a357f..67431e6a4825 100644
--- a/test/CodeGen/AMDGPU/r600.bitcast.ll
+++ b/test/CodeGen/AMDGPU/r600.bitcast.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; This test just checks that the compiler doesn't crash.
 
diff --git a/test/CodeGen/AMDGPU/schedule-regpressure.mir b/test/CodeGen/AMDGPU/schedule-regpressure.mir
index c71de87eeece..3a20ec732e5b 100644
--- a/test/CodeGen/AMDGPU/schedule-regpressure.mir
+++ b/test/CodeGen/AMDGPU/schedule-regpressure.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=amdgcn -misched=converge -run-pass machine-scheduler %s -o - -debug-only=misched 2>&1 | FileCheck %s
+# RUN: llc -march=amdgcn -misched=converge -run-pass machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s
 # REQUIRES: asserts
 
 # Check there is no SReg_32 pressure created by DS_* instructions because of M0 use
diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll
index add90e9c2f3a..f63719d62a84 100644
--- a/test/CodeGen/AMDGPU/setcc.ll
+++ b/test/CodeGen/AMDGPU/setcc.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
diff --git a/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll b/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
new file mode 100644
index 000000000000..1cdfec9fdb59
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck %s
+
+; Check transformation shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
+; Only one shift if expected, GEP shall not produce a separate shift
+
+; CHECK-LABEL: {{^}}add_const_offset:
+; CHECK: v_lshlrev_b32_e32 v[[SHL:[0-9]+]], 4, v0
+; CHECK: v_add_i32_e32 v[[ADD:[0-9]+]], vcc, 0xc80, v[[SHL]]
+; CHECK-NOT: v_lshl
+; CHECK: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADD]]
+; CHECK: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]:
+define amdgpu_kernel void @add_const_offset(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %add = add i32 %id, 200
+  %shl = shl i32 %add, 2
+  %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %shl
+  %val = load i32, i32 addrspace(1)* %ptr, align 4
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}or_const_offset:
+; CHECK: v_lshlrev_b32_e32 v[[SHL:[0-9]+]], 4, v0
+; CHECK: v_or_b32_e32 v[[OR:[0-9]+]], 0x1000, v[[SHL]]
+; CHECK-NOT: v_lshl
+; CHECK: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[OR]]
+; CHECK: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]:
+define amdgpu_kernel void @or_const_offset(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %add = or i32 %id, 256
+  %shl = shl i32 %add, 2
+  %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %shl
+  %val = load i32, i32 addrspace(1)* %ptr, align 4
+  store i32 %val, i32 addrspace(1)* %arg, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
index ff666cc3653b..edc313ee323b 100644
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
 ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #0
 
diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll
index b4355b76016a..44cfdf6398ae 100644
--- a/test/CodeGen/AMDGPU/sra.ll
+++ b/test/CodeGen/AMDGPU/sra.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #0
 
diff --git a/test/CodeGen/AMDGPU/store-global.ll b/test/CodeGen/AMDGPU/store-global.ll
index 160e921fc075..f61e524ee2e5 100644
--- a/test/CodeGen/AMDGPU/store-global.ll
+++ b/test/CodeGen/AMDGPU/store-global.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}store_i1:
 ; EG: MEM_RAT MSKOR
diff --git a/test/CodeGen/AMDGPU/store-private.ll b/test/CodeGen/AMDGPU/store-private.ll
index ab73ada370ea..ce7656adc0b4 100644
--- a/test/CodeGen/AMDGPU/store-private.ll
+++ b/test/CodeGen/AMDGPU/store-private.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}store_i1:
 ; EG: MOVA_INT
diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll
index f366029fdea2..e7655df15520 100644
--- a/test/CodeGen/AMDGPU/sub.ll
+++ b/test/CodeGen/AMDGPU/sub.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
 declare i32 @llvm.r600.read.tidig.x() readnone
diff --git a/test/CodeGen/AMDGPU/unknown-processor.ll b/test/CodeGen/AMDGPU/unknown-processor.ll
index 25a700a943d2..e25f2235993f 100644
--- a/test/CodeGen/AMDGPU/unknown-processor.ll
+++ b/test/CodeGen/AMDGPU/unknown-processor.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=unknown < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
-; RUN: llc -march=r600 -mcpu=unknown < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s
+; RUN: llc -march=amdgcn -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s
+; RUN: llc -march=r600 -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s
 
 ; Should not crash when the processor is not recognized and the
 ; wavefront size feature not set.
diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll
index 03cf725601b7..a0aac8c1d9ba 100644
--- a/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -138,3 +138,25 @@ entry:
   store float %tmp2, float addrspace(1)* %out
   ret void
 }
+
+; The pointer arguments in local address space should not affect promotion to vector.
+
+; OPT-LABEL: @vector_read_with_local_arg(
+; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) {
+entry:
+  %tmp = alloca [4 x i32]
+  %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
+  %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
+  %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
+  %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
+  store i32 0, i32* %x
+  store i32 1, i32* %y
+  store i32 2, i32* %z
+  store i32 3, i32* %w
+  %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index
+  %tmp2 = load i32, i32* %tmp1
+  store i32 %tmp2, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
index 1a0c7fd8e1d6..f4aba880ff76 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
@@ -620,6 +620,360 @@ entry:
   ret float %r
 }
 
+declare arm_aapcscc [3 x i32] @tiny_int_arrays_target([2 x i32])
+
+define arm_aapcscc [3 x i32] @test_tiny_int_arrays([2 x i32] %arr) {
+; CHECK-LABEL: name: test_tiny_int_arrays
+; CHECK: liveins: %r0, %r1
+; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0
+; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1
+; CHECK: [[ARG_ARR0:%[0-9]+]](s64) = IMPLICIT_DEF
+; CHECK: [[ARG_ARR1:%[0-9]+]](s64) = G_INSERT [[ARG_ARR0]], [[R0]](s32), 0
+; CHECK: [[ARG_ARR2:%[0-9]+]](s64) = G_INSERT [[ARG_ARR1]], [[R1]](s32), 32
+; CHECK: [[ARG_ARR:%[0-9]+]](s64) = COPY [[ARG_ARR2]]
+; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s64), 0
+; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s64), 32
+; CHECK: %r0 = COPY [[R0]]
+; CHECK: %r1 = COPY [[R1]]
+; CHECK: BLX @tiny_int_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit-def %r0, implicit-def %r1
+; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0
+; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1
+; CHECK: [[R2:%[0-9]+]](s32) = COPY %r2
+; CHECK: [[RES_ARR0:%[0-9]+]](s96) = IMPLICIT_DEF
+; CHECK: [[RES_ARR1:%[0-9]+]](s96) = G_INSERT [[RES_ARR0]], [[R0]](s32), 0
+; CHECK: [[RES_ARR2:%[0-9]+]](s96) = G_INSERT [[RES_ARR1]], [[R1]](s32), 32
+; CHECK: [[RES_ARR3:%[0-9]+]](s96) = G_INSERT [[RES_ARR2]], [[R2]](s32), 64
+; CHECK: [[RES_ARR:%[0-9]+]](s96) = COPY [[RES_ARR3]]
+; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s96), 0
+; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s96), 32
+; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s96), 64
+; FIXME: This doesn't seem correct with regard to the AAPCS docs (which say
+; that composite types larger than 4 bytes should be passed through memory),
+; but it's what DAGISel does. We should fix it in the common code for both.
+; CHECK: %r0 = COPY [[R0]]
+; CHECK: %r1 = COPY [[R1]]
+; CHECK: %r2 = COPY [[R2]]
+; CHECK: BX_RET 14, _, implicit %r0, implicit %r1, implicit %r2
+entry:
+  %r = notail call arm_aapcscc [3 x i32] @tiny_int_arrays_target([2 x i32] %arr)
+  ret [3 x i32] %r
+}
+
+declare arm_aapcscc void @multiple_int_arrays_target([2 x i32], [2 x i32])
+
+define arm_aapcscc void @test_multiple_int_arrays([2 x i32] %arr0, [2 x i32] %arr1) {
+; CHECK-LABEL: name: test_multiple_int_arrays
+; CHECK: liveins: %r0, %r1
+; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0
+; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1
+; CHECK: [[R2:%[0-9]+]](s32) = COPY %r2
+; CHECK: [[R3:%[0-9]+]](s32) = COPY %r3
+; CHECK: [[ARG_ARR0_0:%[0-9]+]](s64) = IMPLICIT_DEF
+; CHECK: [[ARG_ARR0_1:%[0-9]+]](s64) = G_INSERT [[ARG_ARR0_0]], [[R0]](s32), 0
+; CHECK: [[ARG_ARR0_2:%[0-9]+]](s64) = G_INSERT [[ARG_ARR0_1]], [[R1]](s32), 32
+; CHECK: [[ARG_ARR0:%[0-9]+]](s64) = COPY [[ARG_ARR0_2]]
+; CHECK: [[ARG_ARR1_0:%[0-9]+]](s64) = IMPLICIT_DEF
+; CHECK: [[ARG_ARR1_1:%[0-9]+]](s64) = G_INSERT [[ARG_ARR1_0]], [[R2]](s32), 0
+; CHECK: [[ARG_ARR1_2:%[0-9]+]](s64) = G_INSERT [[ARG_ARR1_1]], [[R3]](s32), 32
+; CHECK: [[ARG_ARR1:%[0-9]+]](s64) = COPY [[ARG_ARR1_2]]
+; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR0]](s64), 0
+; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR0]](s64), 32
+; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR1]](s64), 0
+; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR1]](s64), 32
+; CHECK: %r0 = COPY [[R0]]
+; CHECK: %r1 = COPY [[R1]]
+; CHECK: %r2 = COPY [[R2]]
+; CHECK: %r3 = COPY [[R3]]
+; CHECK: BLX @multiple_int_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3
+; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: BX_RET 14, _
+entry:
+  notail call arm_aapcscc void @multiple_int_arrays_target([2 x i32] %arr0, [2 x i32] %arr1)
+  ret void
+}
+
+declare arm_aapcscc void @large_int_arrays_target([20 x i32])
+
+define arm_aapcscc void @test_large_int_arrays([20 x i32] %arr) {
+; CHECK-LABEL: name: test_large_int_arrays
+; CHECK: fixedStack:
+; The parameters live in separate stack locations, one for each element that
+; doesn't fit in the registers.
+; CHECK-DAG: id: [[FIRST_STACK_ID:[0-9]+]], offset: 0, size: 4
+; CHECK-DAG: id: [[LAST_STACK_ID:[-0]+]], offset: 60, size: 4
+; CHECK: liveins: %r0, %r1, %r2, %r3
+; CHECK-DAG: [[R0:%[0-9]+]](s32) = COPY %r0
+; CHECK-DAG: [[R1:%[0-9]+]](s32) = COPY %r1
+; CHECK-DAG: [[R2:%[0-9]+]](s32) = COPY %r2
+; CHECK-DAG: [[R3:%[0-9]+]](s32) = COPY %r3
+; CHECK: [[FIRST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[FIRST_STACK_ID]]
+; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[FIRST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[FIRST_STACK_ID]]
+; CHECK: [[LAST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[LAST_STACK_ID]]
+; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[LAST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[LAST_STACK_ID]]
+; CHECK: [[ARG_ARR0:%[0-9]+]](s640) = IMPLICIT_DEF
+; CHECK: [[ARG_ARR1:%[0-9]+]](s640) = G_INSERT [[ARG_ARR0]], [[R0]](s32), 0
+; CHECK: [[ARG_ARR2:%[0-9]+]](s640) = G_INSERT [[ARG_ARR1]], [[R1]](s32), 32
+; CHECK: [[ARG_ARR3:%[0-9]+]](s640) = G_INSERT [[ARG_ARR2]], [[R2]](s32), 64
+; CHECK: [[ARG_ARR4:%[0-9]+]](s640) = G_INSERT [[ARG_ARR3]], [[R3]](s32), 96
+; CHECK: [[ARG_ARR5:%[0-9]+]](s640) = G_INSERT [[ARG_ARR4]], [[FIRST_STACK_ELEMENT]](s32), 128
+; CHECK: [[ARG_ARR6:%[0-9]+]](s640) = G_INSERT {{%[0-9]+}}, [[LAST_STACK_ELEMENT]](s32), 608
+; CHECK: [[ARG_ARR:%[0-9]+]](s640) = COPY [[ARG_ARR6]]
+; CHECK: ADJCALLSTACKDOWN 64, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 0
+; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 32
+; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 64
+; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 96
+; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 128
+; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 608
+; CHECK: %r0 = COPY [[R0]]
+; CHECK: %r1 = COPY [[R1]]
+; CHECK: %r2 = COPY [[R2]]
+; CHECK: %r3 = COPY [[R3]]
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFF_FIRST_ELEMENT:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK: [[FIRST_STACK_ARG_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[OFF_FIRST_ELEMENT]](s32)
+; CHECK: G_STORE [[FIRST_STACK_ELEMENT]](s32), [[FIRST_STACK_ARG_ADDR]]{{.*}}store 4
+; Match the second-to-last offset, so we can get the correct SP for the last element
+; CHECK: G_CONSTANT i32 56
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFF_LAST_ELEMENT:%[0-9]+]](s32) = G_CONSTANT i32 60
+; CHECK: [[LAST_STACK_ARG_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[OFF_LAST_ELEMENT]](s32)
+; CHECK: G_STORE [[LAST_STACK_ELEMENT]](s32), [[LAST_STACK_ARG_ADDR]]{{.*}}store 4
+; CHECK: BLX @large_int_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3
+; CHECK: ADJCALLSTACKUP 64, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: BX_RET 14, _
+entry:
+  notail call arm_aapcscc void @large_int_arrays_target([20 x i32] %arr)
+  ret void
+}
+
+declare arm_aapcscc [2 x float] @fp_arrays_aapcs_target([3 x double])
+
+define arm_aapcscc [2 x float] @test_fp_arrays_aapcs([3 x double] %arr) {
+; CHECK-LABEL: name: test_fp_arrays_aapcs
+; CHECK: fixedStack:
+; CHECK: id: [[ARR2_ID:[0-9]+]], offset: 0, size: 8
+; CHECK: liveins: %r0, %r1, %r2, %r3
+; CHECK: [[ARR0_0:%[0-9]+]](s32) = COPY %r0
+; CHECK: [[ARR0_1:%[0-9]+]](s32) = COPY %r1
+; LITTLE: [[ARR0:%[0-9]+]](s64) = G_SEQUENCE [[ARR0_0]](s32), 0, [[ARR0_1]](s32), 32
+; BIG: [[ARR0:%[0-9]+]](s64) = G_SEQUENCE [[ARR0_1]](s32), 0, [[ARR0_0]](s32), 32
+; CHECK: [[ARR1_0:%[0-9]+]](s32) = COPY %r2
+; CHECK: [[ARR1_1:%[0-9]+]](s32) = COPY %r3
+; LITTLE: [[ARR1:%[0-9]+]](s64) = G_SEQUENCE [[ARR1_0]](s32), 0, [[ARR1_1]](s32), 32
+; BIG: [[ARR1:%[0-9]+]](s64) = G_SEQUENCE [[ARR1_1]](s32), 0, [[ARR1_0]](s32), 32
+; CHECK: [[ARR2_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[ARR2_ID]]
+; CHECK: [[ARR2:%[0-9]+]](s64) = G_LOAD [[ARR2_FI]]{{.*}}load 8 from %fixed-stack.[[ARR2_ID]]
+; CHECK: [[ARR_MERGED_0:%[0-9]+]](s192) = IMPLICIT_DEF
+; CHECK: [[ARR_MERGED_1:%[0-9]+]](s192) = G_INSERT [[ARR_MERGED_0]], [[ARR0]](s64), 0
+; CHECK: [[ARR_MERGED_2:%[0-9]+]](s192) = G_INSERT [[ARR_MERGED_1]], [[ARR1]](s64), 64
+; CHECK: [[ARR_MERGED_3:%[0-9]+]](s192) = G_INSERT [[ARR_MERGED_2]], [[ARR2]](s64), 128
+; CHECK: [[ARR_MERGED:%[0-9]+]](s192) = COPY [[ARR_MERGED_3]]
+; CHECK: ADJCALLSTACKDOWN 8, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[ARR0:%[0-9]+]](s64) = G_EXTRACT [[ARR_MERGED]](s192), 0
+; CHECK: [[ARR1:%[0-9]+]](s64) = G_EXTRACT [[ARR_MERGED]](s192), 64
+; CHECK: [[ARR2:%[0-9]+]](s64) = G_EXTRACT [[ARR_MERGED]](s192), 128
+; CHECK: [[ARR0_0:%[0-9]+]](s32) = G_EXTRACT [[ARR0]](s64), 0
+; CHECK: [[ARR0_1:%[0-9]+]](s32) = G_EXTRACT [[ARR0]](s64), 32
+; LITTLE: %r0 = COPY [[ARR0_0]](s32)
+; LITTLE: %r1 = COPY [[ARR0_1]](s32)
+; BIG: %r0 = COPY [[ARR0_1]](s32)
+; BIG: %r1 = COPY [[ARR0_0]](s32)
+; CHECK: [[ARR1_0:%[0-9]+]](s32) = G_EXTRACT [[ARR1]](s64), 0
+; CHECK: [[ARR1_1:%[0-9]+]](s32) = G_EXTRACT [[ARR1]](s64), 32
+; LITTLE: %r2 = COPY [[ARR1_0]](s32)
+; LITTLE: %r3 = COPY [[ARR1_1]](s32)
+; BIG: %r2 = COPY [[ARR1_1]](s32)
+; BIG: %r3 = COPY [[ARR1_0]](s32)
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[ARR2_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK: [[ARR2_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[ARR2_OFFSET]](s32)
+; CHECK: G_STORE [[ARR2]](s64), [[ARR2_ADDR]](p0){{.*}}store 8
+; CHECK: BLX @fp_arrays_aapcs_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
+; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0
+; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1
+; CHECK: [[R_MERGED_0:%[0-9]+]](s64) = IMPLICIT_DEF
+; CHECK: [[R_MERGED_1:%[0-9]+]](s64) = G_INSERT [[R_MERGED_0]], [[R0]](s32), 0
+; CHECK: [[R_MERGED_2:%[0-9]+]](s64) = G_INSERT [[R_MERGED_1]], [[R1]](s32), 32
+; CHECK: [[R_MERGED:%[0-9]+]](s64) = COPY [[R_MERGED_2]]
+; CHECK: ADJCALLSTACKUP 8, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s64), 0
+; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s64), 32
+; CHECK: %r0 = COPY [[R0]]
+; CHECK: %r1 = COPY [[R1]]
+; CHECK: BX_RET 14, _, implicit %r0, implicit %r1
+entry:
+  %r = notail call arm_aapcscc [2 x float] @fp_arrays_aapcs_target([3 x double] %arr)
+  ret [2 x float] %r
+}
+
+declare arm_aapcs_vfpcc [4 x float] @fp_arrays_aapcs_vfp_target([3 x double], [3 x float], [4 x double])
+
+define arm_aapcs_vfpcc [4 x float] @test_fp_arrays_aapcs_vfp([3 x double] %x, [3 x float] %y, [4 x double] %z) {
+; CHECK-LABEL: name: test_fp_arrays_aapcs_vfp
+; CHECK: fixedStack:
+; CHECK-DAG: id: [[Z0_ID:[0-9]+]], offset: 0, size: 8
+; CHECK-DAG: id: [[Z1_ID:[0-9]+]], offset: 8, size: 8
+; CHECK-DAG: id: [[Z2_ID:[0-9]+]], offset: 16, size: 8
+; CHECK-DAG: id: [[Z3_ID:[0-9]+]], offset: 24, size: 8
+; CHECK: liveins: %d0, %d1, %d2, %s6, %s7, %s8
+; CHECK: [[X0:%[0-9]+]](s64) = COPY %d0
+; CHECK: [[X1:%[0-9]+]](s64) = COPY %d1
+; CHECK: [[X2:%[0-9]+]](s64) = COPY %d2
+; CHECK: [[Y0:%[0-9]+]](s32) = COPY %s6
+; CHECK: [[Y1:%[0-9]+]](s32) = COPY %s7
+; CHECK: [[Y2:%[0-9]+]](s32) = COPY %s8
+; CHECK: [[Z0_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Z0_ID]]
+; CHECK: [[Z0:%[0-9]+]](s64) = G_LOAD [[Z0_FI]]{{.*}}load 8
+; CHECK: [[Z1_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Z1_ID]]
+; CHECK: [[Z1:%[0-9]+]](s64) = G_LOAD [[Z1_FI]]{{.*}}load 8
+; CHECK: [[Z2_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Z2_ID]]
+; CHECK: [[Z2:%[0-9]+]](s64) = G_LOAD [[Z2_FI]]{{.*}}load 8
+; CHECK: [[Z3_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Z3_ID]]
+; CHECK: [[Z3:%[0-9]+]](s64) = G_LOAD [[Z3_FI]]{{.*}}load 8
+; CHECK: [[X_ARR_0:%[0-9]+]](s192) = IMPLICIT_DEF
+; CHECK: [[X_ARR_1:%[0-9]+]](s192) = G_INSERT [[X_ARR_0]], [[X0]](s64), 0
+; CHECK: [[X_ARR_2:%[0-9]+]](s192) = G_INSERT [[X_ARR_1]], [[X1]](s64), 64
+; CHECK: [[X_ARR_3:%[0-9]+]](s192) = G_INSERT [[X_ARR_2]], [[X2]](s64), 128
+; CHECK: [[X_ARR:%[0-9]+]](s192) = COPY [[X_ARR_3]](s192)
+; CHECK: [[Y_ARR_0:%[0-9]+]](s96) = IMPLICIT_DEF
+; CHECK: [[Y_ARR_1:%[0-9]+]](s96) = G_INSERT [[Y_ARR_0]], [[Y0]](s32), 0
+; CHECK: [[Y_ARR_2:%[0-9]+]](s96) = G_INSERT [[Y_ARR_1]], [[Y1]](s32), 32
+; CHECK: [[Y_ARR_3:%[0-9]+]](s96) = G_INSERT [[Y_ARR_2]], [[Y2]](s32), 64
+; CHECK: [[Y_ARR:%[0-9]+]](s96) = COPY [[Y_ARR_3]](s96)
+; CHECK: [[Z_ARR_0:%[0-9]+]](s256) = IMPLICIT_DEF
+; CHECK: [[Z_ARR_1:%[0-9]+]](s256) = G_INSERT [[Z_ARR_0]], [[Z0]](s64), 0
+; CHECK: [[Z_ARR_2:%[0-9]+]](s256) = G_INSERT [[Z_ARR_1]], [[Z1]](s64), 64
+; CHECK: [[Z_ARR_3:%[0-9]+]](s256) = G_INSERT [[Z_ARR_2]], [[Z2]](s64), 128
+; CHECK: [[Z_ARR_4:%[0-9]+]](s256) = G_INSERT [[Z_ARR_3]], [[Z3]](s64), 192
+; CHECK: [[Z_ARR:%[0-9]+]](s256) = COPY [[Z_ARR_4]](s256)
+; CHECK: ADJCALLSTACKDOWN 32, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[X0:%[0-9]+]](s64) = G_EXTRACT [[X_ARR]](s192), 0
+; CHECK: [[X1:%[0-9]+]](s64) = G_EXTRACT [[X_ARR]](s192), 64
+; CHECK: [[X2:%[0-9]+]](s64) = G_EXTRACT [[X_ARR]](s192), 128
+; CHECK: [[Y0:%[0-9]+]](s32) = G_EXTRACT [[Y_ARR]](s96), 0
+; CHECK: [[Y1:%[0-9]+]](s32) = G_EXTRACT [[Y_ARR]](s96), 32
+; CHECK: [[Y2:%[0-9]+]](s32) = G_EXTRACT [[Y_ARR]](s96), 64
+; CHECK: [[Z0:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 0
+; CHECK: [[Z1:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 64
+; CHECK: [[Z2:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 128
+; CHECK: [[Z3:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 192
+; CHECK: %d0 = COPY [[X0]](s64)
+; CHECK: %d1 = COPY [[X1]](s64)
+; CHECK: %d2 = COPY [[X2]](s64)
+; CHECK: %s6 = COPY [[Y0]](s32)
+; CHECK: %s7 = COPY [[Y1]](s32)
+; CHECK: %s8 = COPY [[Y2]](s32)
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[Z0_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK: [[Z0_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Z0_OFFSET]](s32)
+; CHECK: G_STORE [[Z0]](s64), [[Z0_ADDR]](p0){{.*}}store 8
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[Z1_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 8
+; CHECK: [[Z1_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Z1_OFFSET]](s32)
+; CHECK: G_STORE [[Z1]](s64), [[Z1_ADDR]](p0){{.*}}store 8
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[Z2_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 16
+; CHECK: [[Z2_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Z2_OFFSET]](s32)
+; CHECK: G_STORE [[Z2]](s64), [[Z2_ADDR]](p0){{.*}}store 8
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[Z3_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 24
+; CHECK: [[Z3_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Z3_OFFSET]](s32)
+; CHECK: G_STORE [[Z3]](s64), [[Z3_ADDR]](p0){{.*}}store 8
+; CHECK: BLX @fp_arrays_aapcs_vfp_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %d0, implicit %d1, implicit %d2, implicit %s6, implicit %s7, implicit %s8, implicit-def %s0, implicit-def %s1, implicit-def %s2, implicit-def %s3
+; CHECK: [[R0:%[0-9]+]](s32) = COPY %s0
+; CHECK: [[R1:%[0-9]+]](s32) = COPY %s1
+; CHECK: [[R2:%[0-9]+]](s32) = COPY %s2
+; CHECK: [[R3:%[0-9]+]](s32) = COPY %s3
+; CHECK: [[R_MERGED_0:%[0-9]+]](s128) = IMPLICIT_DEF
+; CHECK: [[R_MERGED_1:%[0-9]+]](s128) = G_INSERT [[R_MERGED_0]], [[R0]](s32), 0
+; CHECK: [[R_MERGED_2:%[0-9]+]](s128) = G_INSERT [[R_MERGED_1]], [[R1]](s32), 32
+; CHECK: [[R_MERGED_3:%[0-9]+]](s128) = G_INSERT [[R_MERGED_2]], [[R2]](s32), 64
+; CHECK: [[R_MERGED_4:%[0-9]+]](s128) = G_INSERT [[R_MERGED_3]], [[R3]](s32), 96
+; CHECK: [[R_MERGED:%[0-9]+]](s128) = COPY [[R_MERGED_4]]
+; CHECK: ADJCALLSTACKUP 32, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 0
+; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 32
+; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 64
+; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 96
+; CHECK: %s0 = COPY [[R0]]
+; CHECK: %s1 = COPY [[R1]]
+; CHECK: %s2 = COPY [[R2]]
+; CHECK: %s3 = COPY [[R3]]
+; CHECK: BX_RET 14, _, implicit %s0, implicit %s1, implicit %s2, implicit %s3
+entry:
+  %r = notail call arm_aapcs_vfpcc [4 x float] @fp_arrays_aapcs_vfp_target([3 x double] %x, [3 x float] %y, [4 x double] %z)
+  ret [4 x float] %r
+}
+
+declare arm_aapcscc [2 x i32*] @tough_arrays_target([6 x [4 x i32]] %arr)
+
+define arm_aapcscc [2 x i32*] @test_tough_arrays([6 x [4 x i32]] %arr) {
+; CHECK-LABEL: name: test_tough_arrays
+; CHECK: fixedStack:
+; The parameters live in separate stack locations, one for each element that
+; doesn't fit in the registers.
+; CHECK-DAG: id: [[FIRST_STACK_ID:[0-9]+]], offset: 0, size: 4
+; CHECK-DAG: id: [[LAST_STACK_ID:[-0]+]], offset: 76, size: 4
+; CHECK: liveins: %r0, %r1, %r2, %r3
+; CHECK-DAG: [[R0:%[0-9]+]](s32) = COPY %r0
+; CHECK-DAG: [[R1:%[0-9]+]](s32) = COPY %r1
+; CHECK-DAG: [[R2:%[0-9]+]](s32) = COPY %r2
+; CHECK-DAG: [[R3:%[0-9]+]](s32) = COPY %r3
+; CHECK: [[FIRST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[FIRST_STACK_ID]]
+; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[FIRST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[FIRST_STACK_ID]]
+; CHECK: [[LAST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[LAST_STACK_ID]]
+; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[LAST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[LAST_STACK_ID]]
+; CHECK: [[ARG_ARR0:%[0-9]+]](s768) = IMPLICIT_DEF
+; CHECK: [[ARG_ARR1:%[0-9]+]](s768) = G_INSERT [[ARG_ARR0]], [[R0]](s32), 0
+; CHECK: [[ARG_ARR2:%[0-9]+]](s768) = G_INSERT [[ARG_ARR1]], [[R1]](s32), 32
+; CHECK: [[ARG_ARR3:%[0-9]+]](s768) = G_INSERT [[ARG_ARR2]], [[R2]](s32), 64
+; CHECK: [[ARG_ARR4:%[0-9]+]](s768) = G_INSERT [[ARG_ARR3]], [[R3]](s32), 96
+; CHECK: [[ARG_ARR5:%[0-9]+]](s768) = G_INSERT [[ARG_ARR4]], [[FIRST_STACK_ELEMENT]](s32), 128
+; CHECK: [[ARG_ARR6:%[0-9]+]](s768) = G_INSERT {{%[0-9]+}}, [[LAST_STACK_ELEMENT]](s32), 736
+; CHECK: [[ARG_ARR:%[0-9]+]](s768) = COPY [[ARG_ARR6]]
+; CHECK: ADJCALLSTACKDOWN 80, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 0
+; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 32
+; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 64
+; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 96
+; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 128
+; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 736
+; CHECK: %r0 = COPY [[R0]]
+; CHECK: %r1 = COPY [[R1]]
+; CHECK: %r2 = COPY [[R2]]
+; CHECK: %r3 = COPY [[R3]]
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFF_FIRST_ELEMENT:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK: [[FIRST_STACK_ARG_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[OFF_FIRST_ELEMENT]](s32)
+; CHECK: G_STORE [[FIRST_STACK_ELEMENT]](s32), [[FIRST_STACK_ARG_ADDR]]{{.*}}store 4
+; Match the second-to-last offset, so we can get the correct SP for the last element
+; CHECK: G_CONSTANT i32 72
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFF_LAST_ELEMENT:%[0-9]+]](s32) = G_CONSTANT i32 76
+; CHECK: [[LAST_STACK_ARG_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[OFF_LAST_ELEMENT]](s32)
+; CHECK: G_STORE [[LAST_STACK_ELEMENT]](s32), [[LAST_STACK_ARG_ADDR]]{{.*}}store 4
+; CHECK: BLX @tough_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
+; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0
+; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1
+; CHECK: [[RES_ARR0:%[0-9]+]](s64) = IMPLICIT_DEF
+; CHECK: [[RES_ARR1:%[0-9]+]](s64) = G_INSERT [[RES_ARR0]], [[R0]](s32), 0
+; CHECK: [[RES_ARR2:%[0-9]+]](s64) = G_INSERT [[RES_ARR1]], [[R1]](s32), 32
+; CHECK: [[RES_ARR:%[0-9]+]](s64) = COPY [[RES_ARR2]]
+; CHECK: ADJCALLSTACKUP 80, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s64), 0
+; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s64), 32
+; CHECK: %r0 = COPY [[R0]]
+; CHECK: %r1 = COPY [[R1]]
+; CHECK: BX_RET 14, _, implicit %r0, implicit %r1
+entry:
+  %r = notail call arm_aapcscc [2 x i32*] @tough_arrays_target([6 x [4 x i32]] %arr)
+  ret [2 x i32*] %r
+}
+
 define i32 @test_shufflevector_s32_v2s32(i32 %arg) {
 ; CHECK-LABEL: name: test_shufflevector_s32_v2s32
 ; CHECK: [[ARG:%[0-9]+]](s32) = COPY %r0
diff --git a/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
index e3680ed2b929..ef30cb1063f8 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll
@@ -45,11 +45,13 @@ define half @test_half(half %a, half %b) {
   ret half %res
 }
 
-; On ARM, clang lowers structs to arrays.
-define void @test_arrays([2 x i32] %this.could.come.from.a.struct) {
-; CHECK: remark: {{.*}} unable to lower arguments: void ([2 x i32])*
-; CHECK-LABEL: warning: Instruction selection used fallback path for test_arrays
-  ret void
+declare [16 x i32] @ret_demotion_target()
+
+define [16 x i32] @test_ret_demotion() {
+; CHECK: remark: {{.*}} unable to translate instruction: call{{.*}} @ret_demotion_target
+; CHECK-LABEL: warning: Instruction selection used fallback path for test_ret_demotion
+  %res = call [16 x i32] @ret_demotion_target()
+  ret [16 x i32] %res
 }
 
 define void @test_structs({i32, i32} %struct) {
diff --git a/test/CodeGen/ARM/arm-shrink-wrapping.ll b/test/CodeGen/ARM/arm-shrink-wrapping.ll
index 9cce19417047..1985ff9b4a27 100644
--- a/test/CodeGen/ARM/arm-shrink-wrapping.ll
+++ b/test/CodeGen/ARM/arm-shrink-wrapping.ll
@@ -656,6 +656,9 @@ declare double @llvm.pow.f64(double, double)
 ;
 ; DISABLE: pop
 ;
+; FIXME: This is flakey passing by finding 'bl' somewhere amongst the debug
+; info (like labels named 'line_table) not because it's found a bl instruction.
+;
 ; CHECK: bl
 define float @debug_info(float %gamma, float %slopeLimit, i1 %or.cond, double %tmp) "no-frame-pointer-elim"="true" {
 bb:
@@ -681,7 +684,9 @@ bb13:                                             ; preds = %bb3, %bb
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "LLVM", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "LLVM", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !4, globals: !2, imports: !2)
 !1 = !DIFile(filename: "a.cpp", directory: "b")
 !2 = !{}
 !3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{!5}
+!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/test/CodeGen/ARM/constantpool-promote-dbg.ll b/test/CodeGen/ARM/constantpool-promote-dbg.ll
index ae765d26dcac..84386d2975f0 100644
--- a/test/CodeGen/ARM/constantpool-promote-dbg.ll
+++ b/test/CodeGen/ARM/constantpool-promote-dbg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -relocation-model=static < %s | FileCheck %s
+; RUN: llc -relocation-model=static -arm-promote-constant < %s | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv7m--linux-gnu"
diff --git a/test/CodeGen/ARM/constantpool-promote-ldrh.ll b/test/CodeGen/ARM/constantpool-promote-ldrh.ll
index 9e369dc08c4b..59970495874b 100644
--- a/test/CodeGen/ARM/constantpool-promote-ldrh.ll
+++ b/test/CodeGen/ARM/constantpool-promote-ldrh.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -O0 -fast-isel=false | FileCheck %s
-; RUN: llc < %s -O0 -fast-isel=false -filetype=obj
+; RUN: llc < %s -O0 -fast-isel=false -arm-promote-constant | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel=false -filetype=obj -arm-promote-constant
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv6m-arm-linux-gnueabi"
 
diff --git a/test/CodeGen/ARM/constantpool-promote.ll b/test/CodeGen/ARM/constantpool-promote.ll
index 8df7e100c051..d5361f33a98b 100644
--- a/test/CodeGen/ARM/constantpool-promote.ll
+++ b/test/CodeGen/ARM/constantpool-promote.ll
@@ -1,15 +1,15 @@
-; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM
-; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM
-; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM
-; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM
-; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB
-; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB
-; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB
-; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB
-; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M
-; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M
-; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M
-; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M
+; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=static -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM
+; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=pic -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM
+; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=ropi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM
+; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=rwpi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM
+; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=static -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB
+; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=pic -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB
+; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=ropi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB
+; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=rwpi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB
+; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=static -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M
+; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=pic -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M
+; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=ropi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M
+; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=rwpi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M
 
 @.str = private unnamed_addr constant [2 x i8] c"s\00", align 1
 @.str1 = private unnamed_addr constant [69 x i8] c"this string is far too long to fit in a literal pool by far and away\00", align 1
diff --git a/test/CodeGen/ARM/cortexr52-misched-basic.ll b/test/CodeGen/ARM/cortexr52-misched-basic.ll
index 3ccb34d9fc90..eb2c29a3a5d1 100644
--- a/test/CodeGen/ARM/cortexr52-misched-basic.ll
+++ b/test/CodeGen/ARM/cortexr52-misched-basic.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-r52 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=R52_SCHED
-; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic    -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-r52 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=R52_SCHED
+; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic    -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
 ;
 ; Check the latency for instructions for both generic and cortex-r52.
 ; Cortex-r52 machine model will cause the div to be sceduled before eor
diff --git a/test/CodeGen/ARM/fastisel-thumb-litpool.ll b/test/CodeGen/ARM/fastisel-thumb-litpool.ll
index aa9e7260fb2e..53653a5a4f57 100644
--- a/test/CodeGen/ARM/fastisel-thumb-litpool.ll
+++ b/test/CodeGen/ARM/fastisel-thumb-litpool.ll
@@ -5,6 +5,7 @@
 ; hence the CHECK-NOT.
 
 define i32 @test_thumb_ldrlit() minsize {
+; CHECK-LABEL: test_thumb_ldrlit:
 ; CHECK: ldr r0, LCPI0_0
 ; CHECK-NOT: ldr
   ret i32 12345678
diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll
index fb204debf612..b447497b270a 100644
--- a/test/CodeGen/ARM/memcpy-inline.ll
+++ b/test/CodeGen/ARM/memcpy-inline.ll
@@ -35,6 +35,8 @@ entry:
 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]]
 ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]
+; CHECK-T1-LABEL: t1:
+; CHECK-T1: bl _memcpy
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str1, i64 0, i64 0), i64 31, i32 1, i1 false)
   ret void
 }
@@ -51,6 +53,8 @@ entry:
 ; CHECK: str [[REG2]], [r0]
 ; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r3]
+; CHECK-T1-LABEL: t2:
+; CHECK-T1: bl _memcpy
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false)
   ret void
 }
@@ -62,6 +66,8 @@ entry:
 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]!
 ; CHECK: vldr d{{[0-9]+}}, [r1]
 ; CHECK: vst1.8 {d{{[0-9]+}}}, [r0]
+; CHECK-T1-LABEL: t3:
+; CHECK-T1: bl _memcpy
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false)
   ret void
 }
@@ -72,6 +78,8 @@ entry:
 ; CHECK: vld1.64 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1]
 ; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0]!
 ; CHECK: strh [[REG5:r[0-9]+]], [r0]
+; CHECK-T1-LABEL: t4:
+; CHECK-T1: bl _memcpy
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false)
   ret void
 }
@@ -87,10 +95,7 @@ entry:
 ; CHECK: movt [[REG7:r[0-9]+]], #22866
 ; CHECK: str [[REG7]]
 ; CHECK-T1-LABEL: t5:
-; CHECK-T1: movs [[TREG3:r[0-9]]],
-; CHECK-T1: strb [[TREG3]],
-; CHECK-T1: movs [[TREG4:r[0-9]]],
-; CHECK-T1: strb [[TREG4]],
+; CHECK-T1: bl _memcpy
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)
   ret void
 }
diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll
index b86874692aca..b2bd257701d3 100644
--- a/test/CodeGen/ARM/memset-inline.ll
+++ b/test/CodeGen/ARM/memset-inline.ll
@@ -1,22 +1,36 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s -check-prefix=CHECK-7A
+; RUN: llc < %s -mtriple=thumbv6m -pre-RA-sched=source -disable-post-ra -mattr=+strict-align | FileCheck %s -check-prefix=CHECK-6M
 
 define void @t1(i8* nocapture %c) nounwind optsize {
 entry:
-; CHECK-LABEL: t1:
-; CHECK: movs r1, #0
-; CHECK: strd r1, r1, [r0]
-; CHECK: str r1, [r0, #8]
+; CHECK-7A-LABEL: t1:
+; CHECK-7A: movs r1, #0
+; CHECK-7A: strd r1, r1, [r0]
+; CHECK-7A: str r1, [r0, #8]
+; CHECK-6M-LABEL: t1:
+; CHECK-6M: movs r1, #0
+; CHECK-6M: str r1, [r0]
+; CHECK-6M: str r1, [r0, #4]
+; CHECK-6M: str r1, [r0, #8]
   call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false)
   ret void
 }
 
 define void @t2() nounwind ssp {
 entry:
-; CHECK-LABEL: t2:
-; CHECK: vmov.i32 {{q[0-9]+}}, #0x0
-; CHECK: movs r1, #10
-; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2], r1
-; CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]
+; CHECK-7A-LABEL: t2:
+; CHECK-7A: vmov.i32 {{q[0-9]+}}, #0x0
+; CHECK-7A: movs r1, #10
+; CHECK-7A: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2], r1
+; CHECK-7A: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]
+; CHECK-6M-LABEL: t2:
+; CHECK-6M: movs [[REG:r[0-9]+]], #0
+; CHECK-6M: str  [[REG]], [sp, #20]
+; CHECK-6M: str  [[REG]], [sp, #16]
+; CHECK-6M: str  [[REG]], [sp, #12]
+; CHECK-6M: str  [[REG]], [sp, #8]
+; CHECK-6M: str  [[REG]], [sp, #4]
+; CHECK-6M: str  [[REG]], [sp]
   %buf = alloca [26 x i8], align 1
   %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0
   call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)
@@ -24,6 +38,56 @@ entry:
   ret void
 }
 
+define void @t3(i8* %p) {
+entry:
+; CHECK-7A-LABEL: t3:
+; CHECK-7A: muls [[REG:r[0-9]+]],
+; CHECK-7A: str  [[REG]],
+; CHECK-6M-LABEL: t3:
+; CHECK-6M-NOT: muls
+; CHECK-6M: strb [[REG:r[0-9]+]],
+; CHECK-6M: strb [[REG]],
+; CHECK-6M: strb [[REG]],
+; CHECK-6M: strb [[REG]],
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = trunc i32 %i to i8
+  call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 1, i1 false)
+  call void @something(i8* %p)
+  %inc = add nuw nsw i32 %i, 1
+  %exitcond = icmp eq i32 %inc, 255
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @t4(i8* %p) {
+entry:
+; CHECK-7A-LABEL: t4:
+; CHECK-7A: muls [[REG:r[0-9]+]],
+; CHECK-7A: str  [[REG]],
+; CHECK-6M-LABEL: t4:
+; CHECK-6M: muls [[REG:r[0-9]+]],
+; CHECK-6M: strh [[REG]],
+; CHECK-6M: strh [[REG]],
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = trunc i32 %i to i8
+  call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 2, i1 false)
+  call void @something(i8* %p)
+  %inc = add nuw nsw i32 %i, 1
+  %exitcond = icmp eq i32 %inc, 255
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 declare void @something(i8*) nounwind
 declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/CodeGen/ARM/misched-copy-arm.ll b/test/CodeGen/ARM/misched-copy-arm.ll
index 330252a90d7c..53f8b8d15042 100644
--- a/test/CodeGen/ARM/misched-copy-arm.ll
+++ b/test/CodeGen/ARM/misched-copy-arm.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc -mtriple=thumb-eabi -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=misched -arm-atomic-cfg-tidy=0 %s -o - 2>&1 | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=machine-scheduler -arm-atomic-cfg-tidy=0 %s -o - 2>&1 | FileCheck %s
 ;
 ; Loop counter copies should be eliminated.
 ; There is also a MUL here, but we don't care where it is scheduled.
diff --git a/test/CodeGen/ARM/misched-fp-basic.ll b/test/CodeGen/ARM/misched-fp-basic.ll
index 27ad2cec34fd..2f672b0cb540 100644
--- a/test/CodeGen/ARM/misched-fp-basic.ll
+++ b/test/CodeGen/ARM/misched-fp-basic.ll
@@ -1,9 +1,9 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a9 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > \
+; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a9 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > \
 ; RUN:   /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=swift -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > \
+; RUN: llc < %s -mtriple=arm-eabi -mcpu=swift -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > \
 ; RUN:   /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT
-; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-r52 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > \
+; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-r52 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > \
 ; RUN:   /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52
 ;
 ; Check the latency of instructions for processors with sched-models
diff --git a/test/CodeGen/ARM/misched-int-basic-thumb2.mir b/test/CodeGen/ARM/misched-int-basic-thumb2.mir
index 86ef1e26f636..32d1e03d9a1b 100644
--- a/test/CodeGen/ARM/misched-int-basic-thumb2.mir
+++ b/test/CodeGen/ARM/misched-int-basic-thumb2.mir
@@ -1,10 +1,10 @@
 # Basic machine sched model test for Thumb2 int instructions
 # RUN: llc -o /dev/null %s -mtriple=thumbv7-eabi -mcpu=swift -run-pass  machine-scheduler  -enable-misched -verify-misched \
-# RUN:  -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT
+# RUN:  -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT
 # RUN: llc -o /dev/null %s -mtriple=thumbv7--eabi -mcpu=cortex-a9 -run-pass  machine-scheduler  -enable-misched -verify-misched \
-# RUN:  -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9
+# RUN:  -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9
 # RUN: llc -o /dev/null %s -mtriple=thumbv8r-eabi -mcpu=cortex-r52 -run-pass  machine-scheduler  -enable-misched -verify-misched \
-# RUN:  -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52
+# RUN:  -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52
 # REQUIRES: asserts
 --- |
   ; ModuleID = 'foo.ll'
diff --git a/test/CodeGen/ARM/misched-int-basic.mir b/test/CodeGen/ARM/misched-int-basic.mir
index f237c0a07b2e..d5231269d732 100644
--- a/test/CodeGen/ARM/misched-int-basic.mir
+++ b/test/CodeGen/ARM/misched-int-basic.mir
@@ -1,9 +1,9 @@
 # RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=swift -run-pass  machine-scheduler  -enable-misched -verify-misched \
-# RUN:  -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT
+# RUN:  -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT
 # RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-a9 -run-pass  machine-scheduler  -enable-misched -verify-misched \
-# RUN:  -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9
+# RUN:  -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9
 # RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass  machine-scheduler  -enable-misched -verify-misched \
-# RUN:  -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52
+# RUN:  -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52
 # REQUIRES: asserts
 --- |
   ; ModuleID = 'foo.ll'
diff --git a/test/CodeGen/ARM/single-issue-r52.mir b/test/CodeGen/ARM/single-issue-r52.mir
index 6c95f7603e6e..1eba074dafb3 100644
--- a/test/CodeGen/ARM/single-issue-r52.mir
+++ b/test/CodeGen/ARM/single-issue-r52.mir
@@ -1,5 +1,5 @@
-# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass  machine-scheduler  -enable-misched -debug-only=misched -misched-topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN
-# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass  machine-scheduler  -enable-misched -debug-only=misched -misched-bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP
+# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass  machine-scheduler  -enable-misched -debug-only=machine-scheduler -misched-topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN
+# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass  machine-scheduler  -enable-misched -debug-only=machine-scheduler -misched-bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP
 # REQUIRES: asserts
 --- |
   ; ModuleID = 'foo.ll'
diff --git a/test/CodeGen/ARM/vcombine.ll b/test/CodeGen/ARM/vcombine.ll
index 81b22ee12cdd..c08ed81d042a 100644
--- a/test/CodeGen/ARM/vcombine.ll
+++ b/test/CodeGen/ARM/vcombine.ll
@@ -99,7 +99,9 @@ define <4 x i16> @vget_low16(<8 x i16>* %A) nounwind {
 define <8 x i8> @vget_high8(<16 x i8>* %A) nounwind {
 ; CHECK: vget_high8
 ; CHECK-NOT: vst
-; CHECK-LE: vmov r0, r1, d17
+; CHECK-LE-NOT: vld1.64 {d16, d17}, [r0]
+; CHECK-LE: vldr  d16, [r0, #8]
+; CHECK-LE: vmov  r0, r1, d16
 ; CHECK-BE: vmov r1, r0, d16
 	%tmp1 = load <16 x i8>, <16 x i8>* %A
         %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/test/CodeGen/ARM/vext.ll b/test/CodeGen/ARM/vext.ll
index e44e757a3169..5742dc314978 100644
--- a/test/CodeGen/ARM/vext.ll
+++ b/test/CodeGen/ARM/vext.ll
@@ -199,10 +199,10 @@ define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ; CHECK-LABEL: test_undef:
 ; CHECK:       @ BB#0:
-; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT:    vld1.64 {d18, d19}, [r0]
-; CHECK-NEXT:    vzip.16 d19, d16
-; CHECK-NEXT:    vmov r0, r1, d19
+; CHECK-NEXT:    vldr  d16, [r1]
+; CHECK-NEXT:    vldr  d17, [r0, #8]
+; CHECK-NEXT:    vzip.16 d17, d16
+; CHECK-NEXT:    vmov  r0, r1, d17
 ; CHECK-NEXT:    mov pc, lr
         %tmp1 = load <8 x i16>, <8 x i16>* %A
         %tmp2 = load <8 x i16>, <8 x i16>* %B
diff --git a/test/CodeGen/Hexagon/post-ra-kill-update.mir b/test/CodeGen/Hexagon/post-ra-kill-update.mir
index c43624d7a8d3..ac46a70a68a5 100644
--- a/test/CodeGen/Hexagon/post-ra-kill-update.mir
+++ b/test/CodeGen/Hexagon/post-ra-kill-update.mir
@@ -6,7 +6,7 @@
 
 # CHECK-LABEL: name: foo
 # Check for no-kill of r9 in the first instruction, after reordering:
-# CHECK: %d7 = S2_lsr_r_p_or %d7, killed %d1, %r9
+# CHECK: %d7 = S2_lsr_r_p_or killed %d7, killed %d1, %r9
 # CHECK: %d13 = S2_lsr_r_p killed %d0, killed %r9
 
 --- |
diff --git a/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll b/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll
index 18cca5c356e3..242ee53f19f2 100644
--- a/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll
+++ b/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc %s -mtriple=lanai-unknown-unknown -debug-only=misched -o /dev/null 2>&1 | FileCheck %s
+; RUN: llc %s -mtriple=lanai-unknown-unknown -debug-only=machine-scheduler -o /dev/null 2>&1 | FileCheck %s
 
 ; Make sure there are no control dependencies between memory operations that
 ; are trivially disjoint.
diff --git a/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir b/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir
deleted file mode 100644
index 96801f5b0a37..000000000000
--- a/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir
+++ /dev/null
@@ -1,24 +0,0 @@
-# RUN: llc -mtriple=arm-apple-ios -run-pass=if-converter %s -o - | FileCheck %s
----
-name:            foo
-body:             |
-  bb.0:
-    B %bb.2
-
-  bb.1:
-    BX_RET 14, 0
-
-  bb.2:
-    Bcc %bb.1, 1, %cpsr
-
-  bb.3:
-    B %bb.1
-
-...
-
-# We should get a single block containing the BX_RET, with no successors at all
-
-# CHECK:      body:
-# CHECK-NEXT:   bb.0:
-# CHECK-NEXT:     BX_RET
-
diff --git a/test/CodeGen/MSP430/hwmult16.ll b/test/CodeGen/MSP430/hwmult16.ll
index b23f1ad37d81..87b6a7aeacf5 100644
--- a/test/CodeGen/MSP430/hwmult16.ll
+++ b/test/CodeGen/MSP430/hwmult16.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mhwmult=16bit < %s | FileCheck %s
+; RUN: llc -O0 -mattr=+hwmult16 < %s | FileCheck %s
 
 target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
 target triple = "msp430---elf"
diff --git a/test/CodeGen/MSP430/hwmult32.ll b/test/CodeGen/MSP430/hwmult32.ll
index 6ffeb9698862..10c831e77ffb 100644
--- a/test/CodeGen/MSP430/hwmult32.ll
+++ b/test/CodeGen/MSP430/hwmult32.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mhwmult=32bit < %s | FileCheck %s
+; RUN: llc -O0 -mattr=+hwmult32 < %s | FileCheck %s
 
 target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
 target triple = "msp430---elf"
diff --git a/test/CodeGen/MSP430/hwmultf5.ll b/test/CodeGen/MSP430/hwmultf5.ll
index 51ca4be4a654..c57922ece7d0 100644
--- a/test/CodeGen/MSP430/hwmultf5.ll
+++ b/test/CodeGen/MSP430/hwmultf5.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mhwmult=f5series < %s | FileCheck %s
+; RUN: llc -O0 -mattr=+hwmultf5 < %s | FileCheck %s
 
 target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
 target triple = "msp430---elf"
diff --git a/test/CodeGen/MSP430/vararg.ll b/test/CodeGen/MSP430/vararg.ll
index a708b89cbd8f..4baf499848fd 100644
--- a/test/CodeGen/MSP430/vararg.ll
+++ b/test/CodeGen/MSP430/vararg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
 
 target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"
 target triple = "msp430---elf"
diff --git a/test/CodeGen/Nios2/lit.local.cfg b/test/CodeGen/Nios2/lit.local.cfg
new file mode 100644
index 000000000000..84c8b039391b
--- /dev/null
+++ b/test/CodeGen/Nios2/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'Nios2' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/CodeGen/Nios2/target_support.ll b/test/CodeGen/Nios2/target_support.ll
new file mode 100644
index 000000000000..90e7020b2fcc
--- /dev/null
+++ b/test/CodeGen/Nios2/target_support.ll
@@ -0,0 +1,11 @@
+; This tests that llc accepts Nios2 target.
+
+; RUN: not not llc < %s -asm-verbose=false -march=nios2 2>&1 | FileCheck %s --check-prefix=ARCH
+; RUN: not not llc < %s -asm-verbose=false -mtriple=nios2 2>&1 | FileCheck %s --check-prefix=TRIPLE
+
+; ARCH-NOT: invalid target
+; TRIPLE-NOT: unable to get target
+
+define i32 @f(i32 %i) {
+  ret i32 %i
+}
diff --git a/test/CodeGen/PowerPC/atomics-constant.ll b/test/CodeGen/PowerPC/atomics-constant.ll
new file mode 100644
index 000000000000..a92ca813af85
--- /dev/null
+++ b/test/CodeGen/PowerPC/atomics-constant.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@a = constant i64 zeroinitializer
+
+define i64 @foo() {
+; CHECK-LABEL: foo:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    addis 3, 2, .LC0@toc@ha
+; CHECK-NEXT:    li 4, 0
+; CHECK-NEXT:    ld 3, .LC0@toc@l(3)
+; CHECK-NEXT:    cmpw 7, 4, 4
+; CHECK-NEXT:    ld 3, 0(3)
+; CHECK-NEXT:    bne- 7, .+4
+; CHECK-NEXT:    isync
+; CHECK-NEXT:    li 3, 0
+; CHECK-NEXT:    blr
+entry:
+  %value = load atomic i64, i64* @a acquire, align 8
+  ret i64 %value
+}
diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll
index 1bce9d4cb439..c42f677d17ab 100644
--- a/test/CodeGen/PowerPC/build-vector-tests.ll
+++ b/test/CodeGen/PowerPC/build-vector-tests.ll
@@ -869,9 +869,9 @@ entry:
 ; P9LE-LABEL: fromDiffConstsi
 ; P8BE-LABEL: fromDiffConstsi
 ; P8LE-LABEL: fromDiffConstsi
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvw4x
 ; P8BE: blr
@@ -899,9 +899,9 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsAi
 ; P8BE-LABEL: fromDiffMemConsAi
 ; P8LE-LABEL: fromDiffMemConsAi
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvw4x
 ; P8BE: blr
@@ -929,12 +929,12 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsDi
 ; P8BE-LABEL: fromDiffMemConsDi
 ; P8LE-LABEL: fromDiffMemConsDi
-; P9BE: lxvx
-; P9BE: lxvx
+; P9BE: lxv
+; P9BE: lxv
 ; P9BE: vperm
 ; P9BE: blr
-; P9LE: lxvx
-; P9LE: lxvx
+; P9LE: lxv
+; P9LE: lxv
 ; P9LE: vperm
 ; P9LE: blr
 ; P8BE: lxvw4x
@@ -1018,13 +1018,13 @@ entry:
 ; P8BE-LABEL: fromDiffMemVarDi
 ; P8LE-LABEL: fromDiffMemVarDi
 ; P9BE: sldi {{r[0-9]+}}, r4, 2
-; P9BE-DAG: lxvx {{v[0-9]+}}, r3,
-; P9BE-DAG: lxvx
+; P9BE-DAG: lxv {{v[0-9]+}}
+; P9BE-DAG: lxv
 ; P9BE: vperm
 ; P9BE: blr
 ; P9LE: sldi {{r[0-9]+}}, r4, 2
-; P9LE-DAG: lxvx {{v[0-9]+}}, r3,
-; P9LE-DAG: lxvx
+; P9LE-DAG: lxv {{v[0-9]+}}
+; P9LE-DAG: lxv
 ; P9LE: vperm
 ; P9LE: blr
 ; P8BE: sldi {{r[0-9]+}}, r4, 2
@@ -1281,9 +1281,9 @@ entry:
 ; P9LE-LABEL: fromDiffConstsConvftoi
 ; P8BE-LABEL: fromDiffConstsConvftoi
 ; P8LE-LABEL: fromDiffConstsConvftoi
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvw4x
 ; P8BE: blr
@@ -1303,10 +1303,10 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsAConvftoi
 ; P8BE-LABEL: fromDiffMemConsAConvftoi
 ; P8LE-LABEL: fromDiffMemConsAConvftoi
-; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3
+; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3)
 ; P9BE: xvcvspsxws v2, [[REG1]]
 ; P9BE: blr
-; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3
+; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3)
 ; P9LE: xvcvspsxws v2, [[REG1]]
 ; P9LE: blr
 ; P8BE: lxvw4x [[REG1:[vs0-9]+]], 0, r3
@@ -1341,13 +1341,13 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsDConvftoi
 ; P8BE-LABEL: fromDiffMemConsDConvftoi
 ; P8LE-LABEL: fromDiffMemConsDConvftoi
-; P9BE: lxvx
-; P9BE: lxvx
+; P9BE: lxv
+; P9BE: lxv
 ; P9BE: vperm
 ; P9BE: xvcvspsxws
 ; P9BE: blr
-; P9LE: lxvx
-; P9LE: lxvx
+; P9LE: lxv
+; P9LE: lxv
 ; P9LE: vperm
 ; P9LE: xvcvspsxws
 ; P9LE: blr
@@ -1557,9 +1557,9 @@ entry:
 ; P9LE-LABEL: fromDiffConstsConvdtoi
 ; P8BE-LABEL: fromDiffConstsConvdtoi
 ; P8LE-LABEL: fromDiffConstsConvdtoi
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvw4x
 ; P8BE: blr
@@ -1584,16 +1584,16 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsAConvdtoi
 ; P8BE-LABEL: fromDiffMemConsAConvdtoi
 ; P8LE-LABEL: fromDiffMemConsAConvdtoi
-; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3
-; P9BE: lxvx [[REG2:[vs0-9]+]], r3, r4
+; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3)
+; P9BE: lxv [[REG2:[vs0-9]+]], 16(r3)
 ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]]
 ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]]
 ; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
 ; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
 ; P9BE: vmrgew v2, [[REG6]], [[REG5]]
 ; P9BE: xvcvspsxws v2, v2
-; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3
-; P9LE: lxvx [[REG2:[vs0-9]+]], r3, r4
+; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3)
+; P9LE: lxv [[REG2:[vs0-9]+]], 16(r3)
 ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]]
 ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]]
 ; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
@@ -2027,9 +2027,9 @@ entry:
 ; P9LE-LABEL: fromDiffConstsui
 ; P8BE-LABEL: fromDiffConstsui
 ; P8LE-LABEL: fromDiffConstsui
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvw4x
 ; P8BE: blr
@@ -2057,9 +2057,9 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsAui
 ; P8BE-LABEL: fromDiffMemConsAui
 ; P8LE-LABEL: fromDiffMemConsAui
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvw4x
 ; P8BE: blr
@@ -2087,12 +2087,12 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsDui
 ; P8BE-LABEL: fromDiffMemConsDui
 ; P8LE-LABEL: fromDiffMemConsDui
-; P9BE: lxvx
-; P9BE: lxvx
+; P9BE: lxv
+; P9BE: lxv
 ; P9BE: vperm
 ; P9BE: blr
-; P9LE: lxvx
-; P9LE: lxvx
+; P9LE: lxv
+; P9LE: lxv
 ; P9LE: vperm
 ; P9LE: blr
 ; P8BE: lxvw4x
@@ -2177,13 +2177,13 @@ entry:
 ; P8BE-LABEL: fromDiffMemVarDui
 ; P8LE-LABEL: fromDiffMemVarDui
 ; P9BE-DAG: sldi {{r[0-9]+}}, r4, 2
-; P9BE-DAG: lxvx {{v[0-9]+}}, r3
-; P9BE-DAG: lxvx
+; P9BE-DAG: lxv {{v[0-9]+}}, -12(r3)
+; P9BE-DAG: lxv
 ; P9BE: vperm
 ; P9BE: blr
 ; P9LE-DAG: sldi {{r[0-9]+}}, r4, 2
-; P9LE-DAG: lxvx {{v[0-9]+}}, r3
-; P9LE-DAG: lxvx
+; P9LE-DAG: lxv {{v[0-9]+}}, -12(r3)
+; P9LE-DAG: lxv
 ; P9LE: vperm
 ; P9LE: blr
 ; P8BE-DAG: sldi {{r[0-9]+}}, r4, 2
@@ -2439,9 +2439,9 @@ entry:
 ; P9LE-LABEL: fromDiffConstsConvftoui
 ; P8BE-LABEL: fromDiffConstsConvftoui
 ; P8LE-LABEL: fromDiffConstsConvftoui
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvw4x
 ; P8BE: blr
@@ -2461,10 +2461,10 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsAConvftoui
 ; P8BE-LABEL: fromDiffMemConsAConvftoui
 ; P8LE-LABEL: fromDiffMemConsAConvftoui
-; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3
+; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3)
 ; P9BE: xvcvspuxws v2, [[REG1]]
 ; P9BE: blr
-; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3
+; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3)
 ; P9LE: xvcvspuxws v2, [[REG1]]
 ; P9LE: blr
 ; P8BE: lxvw4x [[REG1:[vs0-9]+]], 0, r3
@@ -2499,13 +2499,13 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsDConvftoui
 ; P8BE-LABEL: fromDiffMemConsDConvftoui
 ; P8LE-LABEL: fromDiffMemConsDConvftoui
-; P9BE: lxvx
-; P9BE: lxvx
+; P9BE: lxv
+; P9BE: lxv
 ; P9BE: vperm
 ; P9BE: xvcvspuxws
 ; P9BE: blr
-; P9LE: lxvx
-; P9LE: lxvx
+; P9LE: lxv
+; P9LE: lxv
 ; P9LE: vperm
 ; P9LE: xvcvspuxws
 ; P9LE: blr
@@ -2715,9 +2715,9 @@ entry:
 ; P9LE-LABEL: fromDiffConstsConvdtoui
 ; P8BE-LABEL: fromDiffConstsConvdtoui
 ; P8LE-LABEL: fromDiffConstsConvdtoui
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvw4x
 ; P8BE: blr
@@ -2742,16 +2742,16 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsAConvdtoui
 ; P8BE-LABEL: fromDiffMemConsAConvdtoui
 ; P8LE-LABEL: fromDiffMemConsAConvdtoui
-; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3
-; P9BE: lxvx [[REG2:[vs0-9]+]], r3, r4
+; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3)
+; P9BE: lxv [[REG2:[vs0-9]+]], 16(r3)
 ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]]
 ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]]
 ; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
 ; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]]
 ; P9BE: vmrgew v2, [[REG6]], [[REG5]]
 ; P9BE: xvcvspuxws v2, v2
-; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3
-; P9LE: lxvx [[REG2:[vs0-9]+]], r3, r4
+; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3)
+; P9LE: lxv [[REG2:[vs0-9]+]], 16(r3)
 ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]]
 ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]]
 ; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]]
@@ -3087,9 +3087,9 @@ entry:
 ; P9LE-LABEL: spltConst1ll
 ; P8BE-LABEL: spltConst1ll
 ; P8LE-LABEL: spltConst1ll
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -3105,9 +3105,9 @@ entry:
 ; P9LE-LABEL: spltConst16kll
 ; P8BE-LABEL: spltConst16kll
 ; P8LE-LABEL: spltConst16kll
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -3123,9 +3123,9 @@ entry:
 ; P9LE-LABEL: spltConst32kll
 ; P8BE-LABEL: spltConst32kll
 ; P8LE-LABEL: spltConst32kll
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -3165,9 +3165,9 @@ entry:
 ; P9LE-LABEL: fromDiffConstsll
 ; P8BE-LABEL: fromDiffConstsll
 ; P8LE-LABEL: fromDiffConstsll
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -3188,9 +3188,9 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsAll
 ; P8BE-LABEL: fromDiffMemConsAll
 ; P8LE-LABEL: fromDiffMemConsAll
-; P9BE: lxvx v2
+; P9BE: lxv v2
 ; P9BE: blr
-; P9LE: lxvx v2
+; P9LE: lxv v2
 ; P9LE: blr
 ; P8BE: lxvd2x v2
 ; P8BE: blr
@@ -3213,9 +3213,9 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsDll
 ; P8BE-LABEL: fromDiffMemConsDll
 ; P8LE-LABEL: fromDiffMemConsDll
-; P9BE: lxvx v2
+; P9BE: lxv v2
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: xxswapd v2
 ; P9LE: blr
 ; P8BE: lxvd2x
@@ -3275,11 +3275,11 @@ entry:
 ; P8BE-LABEL: fromDiffMemVarDll
 ; P8LE-LABEL: fromDiffMemVarDll
 ; P9BE: sldi
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: xxswapd v2
 ; P9BE-NEXT: blr
 ; P9LE: sldi
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: xxswapd v2
 ; P9LE-NEXT: blr
 ; P8BE: sldi
@@ -3422,9 +3422,9 @@ entry:
 ; P9LE-LABEL: spltCnstConvftoll
 ; P8BE-LABEL: spltCnstConvftoll
 ; P8LE-LABEL: spltCnstConvftoll
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -3466,9 +3466,9 @@ entry:
 ; P9LE-LABEL: fromDiffConstsConvftoll
 ; P8BE-LABEL: fromDiffConstsConvftoll
 ; P8LE-LABEL: fromDiffConstsConvftoll
-; P9BE: lxvx v2
+; P9BE: lxv v2
 ; P9BE: blr
-; P9LE: lxvx v2
+; P9LE: lxv v2
 ; P9LE: blr
 ; P8BE: lxvd2x v2
 ; P8BE: blr
@@ -3705,9 +3705,9 @@ entry:
 ; P9LE-LABEL: spltCnstConvdtoll
 ; P8BE-LABEL: spltCnstConvdtoll
 ; P8LE-LABEL: spltCnstConvdtoll
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -3749,9 +3749,9 @@ entry:
 ; P9LE-LABEL: fromDiffConstsConvdtoll
 ; P8BE-LABEL: fromDiffConstsConvdtoll
 ; P8LE-LABEL: fromDiffConstsConvdtoll
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -3770,10 +3770,10 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsAConvdtoll
 ; P8BE-LABEL: fromDiffMemConsAConvdtoll
 ; P8LE-LABEL: fromDiffMemConsAConvdtoll
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE-NEXT: xvcvdpsxds v2
 ; P9BE-NEXT: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE-NEXT: xvcvdpsxds v2
 ; P9LE-NEXT: blr
 ; P8BE: lxvd2x
@@ -3801,11 +3801,11 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsDConvdtoll
 ; P8BE-LABEL: fromDiffMemConsDConvdtoll
 ; P8LE-LABEL: fromDiffMemConsDConvdtoll
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE-NEXT: xxswapd
 ; P9BE-NEXT: xvcvdpsxds v2
 ; P9BE-NEXT: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE-NEXT: xxswapd
 ; P9LE-NEXT: xvcvdpsxds v2
 ; P9LE-NEXT: blr
@@ -3876,12 +3876,12 @@ entry:
 ; P8BE-LABEL: fromDiffMemVarDConvdtoll
 ; P8LE-LABEL: fromDiffMemVarDConvdtoll
 ; P9BE: sldi
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE-NEXT: xxswapd
 ; P9BE-NEXT: xvcvdpsxds v2
 ; P9BE-NEXT: blr
 ; P9LE: sldi
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE-NEXT: xxswapd
 ; P9LE-NEXT: xvcvdpsxds v2
 ; P9LE-NEXT: blr
@@ -3991,9 +3991,9 @@ entry:
 ; P9LE-LABEL: spltConst1ull
 ; P8BE-LABEL: spltConst1ull
 ; P8LE-LABEL: spltConst1ull
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -4009,9 +4009,9 @@ entry:
 ; P9LE-LABEL: spltConst16kull
 ; P8BE-LABEL: spltConst16kull
 ; P8LE-LABEL: spltConst16kull
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -4027,9 +4027,9 @@ entry:
 ; P9LE-LABEL: spltConst32kull
 ; P8BE-LABEL: spltConst32kull
 ; P8LE-LABEL: spltConst32kull
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -4069,9 +4069,9 @@ entry:
 ; P9LE-LABEL: fromDiffConstsull
 ; P8BE-LABEL: fromDiffConstsull
 ; P8LE-LABEL: fromDiffConstsull
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -4092,9 +4092,9 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsAull
 ; P8BE-LABEL: fromDiffMemConsAull
 ; P8LE-LABEL: fromDiffMemConsAull
-; P9BE: lxvx v2
+; P9BE: lxv v2
 ; P9BE: blr
-; P9LE: lxvx v2
+; P9LE: lxv v2
 ; P9LE: blr
 ; P8BE: lxvd2x v2
 ; P8BE: blr
@@ -4117,9 +4117,9 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsDull
 ; P8BE-LABEL: fromDiffMemConsDull
 ; P8LE-LABEL: fromDiffMemConsDull
-; P9BE: lxvx v2
+; P9BE: lxv v2
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: xxswapd v2
 ; P9LE: blr
 ; P8BE: lxvd2x
@@ -4179,11 +4179,11 @@ entry:
 ; P8BE-LABEL: fromDiffMemVarDull
 ; P8LE-LABEL: fromDiffMemVarDull
 ; P9BE: sldi
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: xxswapd v2
 ; P9BE-NEXT: blr
 ; P9LE: sldi
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: xxswapd v2
 ; P9LE-NEXT: blr
 ; P8BE: sldi
@@ -4326,9 +4326,9 @@ entry:
 ; P9LE-LABEL: spltCnstConvftoull
 ; P8BE-LABEL: spltCnstConvftoull
 ; P8LE-LABEL: spltCnstConvftoull
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -4370,9 +4370,9 @@ entry:
 ; P9LE-LABEL: fromDiffConstsConvftoull
 ; P8BE-LABEL: fromDiffConstsConvftoull
 ; P8LE-LABEL: fromDiffConstsConvftoull
-; P9BE: lxvx v2
+; P9BE: lxv v2
 ; P9BE: blr
-; P9LE: lxvx v2
+; P9LE: lxv v2
 ; P9LE: blr
 ; P8BE: lxvd2x v2
 ; P8BE: blr
@@ -4609,9 +4609,9 @@ entry:
 ; P9LE-LABEL: spltCnstConvdtoull
 ; P8BE-LABEL: spltCnstConvdtoull
 ; P8LE-LABEL: spltCnstConvdtoull
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -4653,9 +4653,9 @@ entry:
 ; P9LE-LABEL: fromDiffConstsConvdtoull
 ; P8BE-LABEL: fromDiffConstsConvdtoull
 ; P8LE-LABEL: fromDiffConstsConvdtoull
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE: blr
 ; P8BE: lxvd2x
 ; P8BE: blr
@@ -4674,10 +4674,10 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsAConvdtoull
 ; P8BE-LABEL: fromDiffMemConsAConvdtoull
 ; P8LE-LABEL: fromDiffMemConsAConvdtoull
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE-NEXT: xvcvdpuxds v2
 ; P9BE-NEXT: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE-NEXT: xvcvdpuxds v2
 ; P9LE-NEXT: blr
 ; P8BE: lxvd2x
@@ -4705,11 +4705,11 @@ entry:
 ; P9LE-LABEL: fromDiffMemConsDConvdtoull
 ; P8BE-LABEL: fromDiffMemConsDConvdtoull
 ; P8LE-LABEL: fromDiffMemConsDConvdtoull
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE-NEXT: xxswapd
 ; P9BE-NEXT: xvcvdpuxds v2
 ; P9BE-NEXT: blr
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE-NEXT: xxswapd
 ; P9LE-NEXT: xvcvdpuxds v2
 ; P9LE-NEXT: blr
@@ -4780,12 +4780,12 @@ entry:
 ; P8BE-LABEL: fromDiffMemVarDConvdtoull
 ; P8LE-LABEL: fromDiffMemVarDConvdtoull
 ; P9BE: sldi
-; P9BE: lxvx
+; P9BE: lxv
 ; P9BE-NEXT: xxswapd
 ; P9BE-NEXT: xvcvdpuxds v2
 ; P9BE-NEXT: blr
 ; P9LE: sldi
-; P9LE: lxvx
+; P9LE: lxv
 ; P9LE-NEXT: xxswapd
 ; P9LE-NEXT: xvcvdpuxds v2
 ; P9LE-NEXT: blr
diff --git a/test/CodeGen/PowerPC/livephysregs.mir b/test/CodeGen/PowerPC/livephysregs.mir
new file mode 100644
index 000000000000..6b6268778e99
--- /dev/null
+++ b/test/CodeGen/PowerPC/livephysregs.mir
@@ -0,0 +1,52 @@
+# RUN: llc -o - %s -mtriple=powerpc64le--linux-gnu -run-pass=branch-folder | FileCheck %s
+# The branch-folder should merge bb.1 and bb.5 below and therefore recalculate
+# the liveins list of the merged block. This test is checking whether this
+# recalculated list if okay and contains all the non-saved and saved CSRs.
+# CHECK-LABEL: name: func
+# CHECK: bb.3:
+# CHECK-NEXT: liveins: %x30, %x29, %x3, %x6
+# CHECK: %x4 = RLDICR killed %x6, 16, 47
+# CHECK: %x3 = OR8 killed %x4, killed %x3
+# CHECK: BLR8 implicit %lr8, implicit %rm, implicit %x3
+---
+name: func
+tracksRegLiveness: true
+fixedStack:      
+  - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, callee-saved-register: '%x30' }
+  - { id: 1, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '%x29' }
+  - { id: 2, offset: -8, size: 8, alignment: 8, isImmutable: true, isAliased: false }
+body: |
+  bb.0:
+    liveins: %x3, %x5, %x29, %x30
+  
+    %x6 = RLWINM8 %x3, 16, 16, 31
+    %x3 = RLDICL killed %x3, 0, 48
+    BC undef %cr5lt, %bb.3
+  
+  bb.1:
+    liveins: %x3, %x6, %x29, %x30
+  
+    %x4 = RLDICR killed %x6, 16, 47
+    %x3 = OR8 killed %x4, killed %x3
+    BLR8 implicit %lr8, implicit %rm, implicit %x3
+  
+  bb.3:
+    liveins: %x3, %x5, %x6, %x29, %x30
+
+    dead %x5 = ADD8 %x5, %x6
+    BC undef %cr5lt, %bb.1
+
+  bb.6:
+    liveins: %x3, %x6, %x29, %x30
+    STD killed %x29, -24, %x1 :: (store 8 into %fixed-stack.1)
+    STD killed %x30, -16, %x1 :: (store 8 into %fixed-stack.0, align 16)
+    NOP implicit-def dead %x29
+    NOP implicit-def dead %x30
+
+    %x30 = LD -16, %x1 :: (load 8 from %fixed-stack.0, align 16)
+    %x29 = LD -24, %x1 :: (load 8 from %fixed-stack.1)
+  
+    %x4 = RLDICR killed %x6, 16, 47
+    %x3 = OR8 killed %x4, killed %x3
+    BLR8 implicit %lr8, implicit %rm, implicit %x3 
+...
diff --git a/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll b/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll
index 329f5bb59cb1..de930af75b2d 100644
--- a/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll
+++ b/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll
@@ -21,7 +21,7 @@ entry:
   ret <16 x i8> %strided.vec
 
 ; CHECK-LABEL: @test2
-; CHECK: vsldoi 2, 2, 2, 12
+; CHECK: xxsldwi 34, 34, 34, 3
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll b/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll
index e3326595d132..fe34bcb85637 100644
--- a/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll
+++ b/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll
@@ -6,7 +6,7 @@
 define <4 x float> @_Z7testInsILj0ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 12
 ; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -45,7 +45,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 12
 ; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 0
   %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
   ret <4 x float> %vecins
@@ -54,7 +54,7 @@ entry:
 define <4 x float> @_Z7testInsILj1ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 8
 ; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -93,7 +93,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 8
 ; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 4
   %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
   ret <4 x float> %vecins
@@ -102,7 +102,7 @@ entry:
 define <4 x float> @_Z7testInsILj2ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 4
 ; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -141,7 +141,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 4
 ; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 8
   %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 7, i32 3>
   ret <4 x float> %vecins
@@ -150,7 +150,7 @@ entry:
 define <4 x float> @_Z7testInsILj3ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 0
 ; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -189,7 +189,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 0
 ; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 12
   %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   ret <4 x float> %vecins
@@ -198,7 +198,7 @@ entry:
 define <4 x i32> @_Z7testInsILj0ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 12
 ; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -237,7 +237,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 12
 ; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 0
   %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
   ret <4 x i32> %vecins
@@ -246,7 +246,7 @@ entry:
 define <4 x i32> @_Z7testInsILj1ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 8
 ; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -285,7 +285,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 8
 ; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 4
   %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
   ret <4 x i32> %vecins
@@ -294,7 +294,7 @@ entry:
 define <4 x i32> @_Z7testInsILj2ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 4
 ; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -333,7 +333,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 4
 ; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 8
   %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 7, i32 3>
   ret <4 x i32> %vecins
@@ -342,7 +342,7 @@ entry:
 define <4 x i32> @_Z7testInsILj3ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 0
 ; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -381,7 +381,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 0
 ; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 12
   %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   ret <4 x i32> %vecins
@@ -546,7 +546,7 @@ entry:
 define <4 x float> @_Z7testInsILj0ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 12
 ; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -585,7 +585,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 12
 ; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 0
   %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 3, i32 5, i32 6, i32 7>
   ret <4 x float> %vecins
@@ -594,7 +594,7 @@ entry:
 define <4 x float> @_Z7testInsILj1ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 8
 ; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -633,7 +633,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 8
 ; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 4
   %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 3, i32 6, i32 7>
   ret <4 x float> %vecins
@@ -642,7 +642,7 @@ entry:
 define <4 x float> @_Z7testInsILj2ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 4
 ; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -681,7 +681,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 4
 ; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 8
   %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 3, i32 7>
   ret <4 x float> %vecins
@@ -690,7 +690,7 @@ entry:
 define <4 x float> @_Z7testInsILj3ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 0
 ; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -729,7 +729,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 0
 ; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 12
   %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
   ret <4 x float> %vecins
@@ -738,7 +738,7 @@ entry:
 define <4 x i32> @_Z7testInsILj0ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 12
 ; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -777,7 +777,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 12
 ; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 0
   %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 3, i32 5, i32 6, i32 7>
   ret <4 x i32> %vecins
@@ -786,7 +786,7 @@ entry:
 define <4 x i32> @_Z7testInsILj1ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 8
 ; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -825,7 +825,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 8
 ; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 4
   %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 3, i32 6, i32 7>
   ret <4 x i32> %vecins
@@ -834,7 +834,7 @@ entry:
 define <4 x i32> @_Z7testInsILj2ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 4
 ; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -873,7 +873,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 4
 ; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 8
   %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 3, i32 7>
   ret <4 x i32> %vecins
@@ -882,7 +882,7 @@ entry:
 define <4 x i32> @_Z7testInsILj3ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) {
 entry:
 ; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_
-; CHECK: xxsldwi 0, 35, 35, 2
+; CHECK: xxswapd 0, 35
 ; CHECK: xxinsertw 34, 0, 0
 ; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_
 ; CHECK-BE: xxsldwi 0, 35, 35, 3
@@ -921,7 +921,7 @@ entry:
 ; CHECK: xxsldwi 0, 35, 35, 3
 ; CHECK: xxinsertw 34, 0, 0
 ; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_
-; CHECK-BE: xxsldwi 0, 35, 35, 2
+; CHECK-BE: xxswapd 0, 35
 ; CHECK-BE: xxinsertw 34, 0, 12
   %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
   ret <4 x i32> %vecins
@@ -972,10 +972,10 @@ define <4 x float> @insertVarF(<4 x float> %a, float %f, i32 %el) {
 entry:
 ; CHECK-LABEL: insertVarF
 ; CHECK: stxsspx 1,
-; CHECK: lxvx
+; CHECK: lxv
 ; CHECK-BE-LABEL: insertVarF
 ; CHECK-BE: stxsspx 1,
-; CHECK-BE: lxvx
+; CHECK-BE: lxv
   %vecins = insertelement <4 x float> %a, float %f, i32 %el
   ret <4 x float> %vecins
 }
@@ -983,10 +983,10 @@ define <4 x i32> @insertVarI(<4 x i32> %a, i32 %i, i32 %el) {
 entry:
 ; CHECK-LABEL: insertVarI
 ; CHECK: stwx
-; CHECK: lxvx
+; CHECK: lxv
 ; CHECK-BE-LABEL: insertVarI
 ; CHECK-BE: stwx
-; CHECK-BE: lxvx
+; CHECK-BE: lxv
   %vecins = insertelement <4 x i32> %a, i32 %i, i32 %el
   ret <4 x i32> %vecins
 }
diff --git a/test/CodeGen/PowerPC/ppc64-i128-abi.ll b/test/CodeGen/PowerPC/ppc64-i128-abi.ll
index 4a8fd90db3eb..90dd1d84fc23 100644
--- a/test/CodeGen/PowerPC/ppc64-i128-abi.ll
+++ b/test/CodeGen/PowerPC/ppc64-i128-abi.ll
@@ -63,7 +63,7 @@ define <1 x i128> @v1i128_increment_by_one(<1 x i128> %a) nounwind {
 ; FIXME: li [[R1:r[0-9]+]], 1
 ; FIXME: li [[R2:r[0-9]+]], 0
 ; FIXME: mtvsrdd [[V1:v[0-9]+]], [[R2]], [[R1]]
-; CHECK-P9: lxvx [[V1:v[0-9]+]]
+; CHECK-P9: lxv [[V1:v[0-9]+]]
 ; CHECK-P9: vadduqm v2, v2, [[V1]]
 ; CHECK-P9: blr
 
@@ -207,7 +207,7 @@ define <1 x i128> @call_v1i128_increment_by_one() nounwind {
 ; CHECK-LE: blr
 
 ; CHECK-P9-LABEL: @call_v1i128_increment_by_one
-; CHECK-P9: lxvx
+; CHECK-P9: lxv
 ; CHECK-P9: bl v1i128_increment_by_one
 ; CHECK-P9: blr
 
@@ -237,8 +237,8 @@ define <1 x i128> @call_v1i128_increment_by_val() nounwind {
 ; CHECK-LE: blr
 
 ; CHECK-P9-LABEL: @call_v1i128_increment_by_val
-; CHECK-P9-DAG: lxvx v2
-; CHECK-P9-DAG: lxvx v3
+; CHECK-P9-DAG: lxv v2
+; CHECK-P9-DAG: lxv v3
 ; CHECK-P9: bl v1i128_increment_by_val
 ; CHECK-P9: blr
 
diff --git a/test/CodeGen/PowerPC/pr25157-peephole.ll b/test/CodeGen/PowerPC/pr25157-peephole.ll
index 7f959add00f6..aacd64e401f5 100644
--- a/test/CodeGen/PowerPC/pr25157-peephole.ll
+++ b/test/CodeGen/PowerPC/pr25157-peephole.ll
@@ -65,5 +65,5 @@ L.LB38_2452:
 ; CHECK-P9-LABEL: @aercalc_
 ; CHECK-P9: lfs
 ; CHECK-P9: xxspltd
-; CHECK-P9: stxvx
+; CHECK-P9: stxv
 ; CHECK-P9-NOT: xxswapd
diff --git a/test/CodeGen/PowerPC/pr27078.ll b/test/CodeGen/PowerPC/pr27078.ll
index b100e3a5ba53..d97008ee5578 100644
--- a/test/CodeGen/PowerPC/pr27078.ll
+++ b/test/CodeGen/PowerPC/pr27078.ll
@@ -9,11 +9,11 @@ define <4 x float> @bar(float* %p, float* %q) {
   %6 = shufflevector <12 x float> %5, <12 x float> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
   ret <4 x float>  %6
 
-; CHECK: vsldoi
+; CHECK: xxsldwi
 ; CHECK-NEXT: vmrghw
 ; CHECK-NEXT: vmrglw
-; CHECK-NEXT: vsldoi
-; CHECK-NEXT: vsldoi
-; CHECK-NEXT: vsldoi
+; CHECK-NEXT: xxsldwi
+; CHECK-NEXT: xxsldwi
+; CHECK-NEXT: xxsldwi
 ; CHECK-NEXT: blr
 }
diff --git a/test/CodeGen/PowerPC/swaps-le-6.ll b/test/CodeGen/PowerPC/swaps-le-6.ll
index d573441f2cc9..e7640cab6aef 100644
--- a/test/CodeGen/PowerPC/swaps-le-6.ll
+++ b/test/CodeGen/PowerPC/swaps-le-6.ll
@@ -33,11 +33,11 @@ entry:
 ; CHECK: stxvd2x [[REG5]]
 
 ; CHECK-P9-LABEL: @bar0
-; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]]
+; CHECK-P9-DAG: lxv [[REG1:[0-9]+]]
 ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3)
 ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0
 ; CHECK-P9: xxpermdi [[REG5:[0-9]+]], [[REG1]], [[REG4]], 1
-; CHECK-P9: stxvx [[REG5]]
+; CHECK-P9: stxv [[REG5]]
 
 define void @bar1() {
 entry:
@@ -56,9 +56,9 @@ entry:
 ; CHECK: stxvd2x [[REG5]]
 
 ; CHECK-P9-LABEL: @bar1
-; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]]
+; CHECK-P9-DAG: lxv [[REG1:[0-9]+]]
 ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3)
 ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0
 ; CHECK-P9: xxmrgld [[REG5:[0-9]+]], [[REG4]], [[REG1]]
-; CHECK-P9: stxvx [[REG5]]
+; CHECK-P9: stxv [[REG5]]
 
diff --git a/test/CodeGen/PowerPC/vec_sldwi.ll b/test/CodeGen/PowerPC/vec_sldwi.ll
new file mode 100644
index 000000000000..01537d1f5927
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_sldwi.ll
@@ -0,0 +1,307 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | \
+; RUN:   FileCheck %s  -check-prefix=CHECK-LE
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | \
+; RUN:   FileCheck %s -check-prefix=CHECK-BE
+
+; Possible LE ShuffleVector masks (Case 1):
+; ShuffleVector((vector int)a, vector(int)b, 0, 1, 2, 3)
+; ShuffleVector((vector int)a, vector(int)b, 7, 0, 1, 2)
+; ShuffleVector((vector int)a, vector(int)b, 6, 7, 0, 1)
+; ShuffleVector((vector int)a, vector(int)b, 5, 6, 7, 0)
+; which targets at:
+; xxsldwi a, b, 0
+; xxsldwi a, b, 1
+; xxsldwi a, b, 2
+; xxsldwi a, b, 3
+; Possible LE Swap ShuffleVector masks (Case 2):
+; ShuffleVector((vector int)a, vector(int)b, 4, 5, 6, 7)
+; ShuffleVector((vector int)a, vector(int)b, 3, 4, 5, 6)
+; ShuffleVector((vector int)a, vector(int)b, 2, 3, 4, 5)
+; ShuffleVector((vector int)a, vector(int)b, 1, 2, 3, 4)
+; which targets at:
+; xxsldwi b, a, 0
+; xxsldwi b, a, 1
+; xxsldwi b, a, 2
+; xxsldwi b, a, 3
+; Possible LE ShuffleVector masks when a == b, b is undef (Case 3):
+; ShuffleVector((vector int)a, vector(int)a, 0, 1, 2, 3)
+; ShuffleVector((vector int)a, vector(int)a, 3, 0, 1, 2)
+; ShuffleVector((vector int)a, vector(int)a, 2, 3, 0, 1)
+; ShuffleVector((vector int)a, vector(int)a, 1, 2, 3, 0)
+; which targets at:
+; xxsldwi a, a, 0
+; xxsldwi a, a, 1
+; xxsldwi a, a, 2
+; xxsldwi a, a, 3
+
+; Possible BE ShuffleVector masks (Case 4):
+; ShuffleVector((vector int)a, vector(int)b, 0, 1, 2, 3)
+; ShuffleVector((vector int)a, vector(int)b, 1, 2, 3, 4)
+; ShuffleVector((vector int)a, vector(int)b, 2, 3, 4, 5)
+; ShuffleVector((vector int)a, vector(int)b, 3, 4, 5, 6)
+; which targets at:
+; xxsldwi b, a, 0
+; xxsldwi b, a, 1
+; xxsldwi a, a, 2
+; xxsldwi a, a, 3
+; Possible BE Swap ShuffleVector masks (Case 5):
+; ShuffleVector((vector int)a, vector(int)b, 4, 5, 6, 7)
+; ShuffleVector((vector int)a, vector(int)b, 5, 6, 7, 0)
+; ShuffleVector((vector int)a, vector(int)b, 6, 7, 0, 1)
+; ShuffleVector((vector int)a, vector(int)b, 7, 0, 1, 2)
+; which targets at:
+; xxsldwi b, a, 0
+; xxsldwi b, a, 1
+; xxsldwi b, a, 2
+; xxsldwi b, a, 3
+; Possible BE ShuffleVector masks when a == b, b is undef (Case 6):
+; ShuffleVector((vector int)a, vector(int)b, 0, 1, 2, 3)
+; ShuffleVector((vector int)a, vector(int)a, 1, 2, 3, 0)
+; ShuffleVector((vector int)a, vector(int)a, 2, 3, 0, 1)
+; ShuffleVector((vector int)a, vector(int)a, 3, 0, 1, 2)
+; which targets at:
+; xxsldwi a, a, 0
+; xxsldwi a, a, 1
+; xxsldwi a, a, 2
+; xxsldwi a, a, 3
+
+define <4 x i32> @check_le_vec_sldwi_va_vb_0(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %0
+; CHECK-LE-LABEL: @check_le_vec_sldwi_va_vb_0
+; CHECK-LE: blr
+}
+
+define <4 x i32> @check_le_vec_sldwi_va_vb_1(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
+  ret <4 x i32> %0
+; CHECK-LE-LABEL: @check_le_vec_sldwi_va_vb_1
+; CHECK-LE: xxsldwi 34, 34, 35, 1
+; CHECK-LE: blr
+}
+
+define <4 x i32> @check_le_vec_sldwi_va_vb_2(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i32> %0
+; CHECK-LE-LABEL: @check_le_vec_sldwi_va_vb_2
+; CHECK-LE: xxsldwi 34, 34, 35, 2
+; CHECK-LE: blr
+}
+
+define <4 x i32> @check_le_vec_sldwi_va_vb_3(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
+  ret <4 x i32> %0
+; CHECK-LE-LABEL: @check_le_vec_sldwi_va_vb_3
+; CHECK-LE: xxsldwi 34, 34, 35, 3
+; CHECK-LE: blr
+}
+
+define <4 x i32> @check_le_swap_vec_sldwi_va_vb_0(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x i32> %0
+; CHECK-LE-LABEL: @check_le_swap_vec_sldwi_va_vb_0
+; CHECK-LE; vmr 2, 3
+; CHECK-LE: blr
+}
+
+define <4 x i32> @check_le_swap_vec_sldwi_va_vb_1(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x i32> %0
+; CHECK-LE-LABEL: @check_le_swap_vec_sldwi_va_vb_1
+; CHECK-LE: xxsldwi 34, 35, 34, 1
+; CHECK-LE: blr
+}
+
+define <4 x i32> @check_le_swap_vec_sldwi_va_vb_2(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x i32> %0
+; CHECK-LE-LABEL: @check_le_swap_vec_sldwi_va_vb_2
+; CHECK-LE: xxsldwi 34, 35, 34, 2
+; CHECK-LE: blr
+}
+
+define <4 x i32> @check_le_swap_vec_sldwi_va_vb_3(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i32> %0
+; CHECK-LE-LABEL: @check_le_swap_vec_sldwi_va_vb_3
+; CHECK-LE: xxsldwi 34, 35, 34, 3
+; CHECK-LE: blr
+}
+
+define <4 x i32> @check_le_vec_sldwi_va_undef_0(<4 x i32> %VA) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %0
+; CHECK-LE-LABEL: @check_le_vec_sldwi_va_undef_0
+; CHECK-LE: blr
+}
+
+define <4 x i32> @check_le_vec_sldwi_va_undef_1(<4 x i32> %VA) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+  ret <4 x i32> %0
+; CHECK-BE-LABEL: @check_le_vec_sldwi_va_undef_1
+; CHECK-LE: xxsldwi 34, 34, 34, 1
+; CHECK-LE: blr
+}
+
+define <4 x i32> @check_le_vec_sldwi_va_undef_2(<4 x i32> %VA) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  ret <4 x i32> %0
+; CHECK-LE-LABEL: @check_le_vec_sldwi_va_undef_2
+; CHECK-LE: xxswapd 34, 34
+; CHECK-LE: blr
+}
+
+define <4 x i32> @check_le_vec_sldwi_va_undef_3(<4 x i32> %VA) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+  ret <4 x i32> %0
+; CHECK-LE-LABEL: @check_le_vec_sldwi_va_undef_3
+; CHECK-LE: xxsldwi 34, 34, 34, 3
+; CHECK-LE: blr
+}
+
+define <4 x i32> @check_be_vec_sldwi_va_vb_0(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %0
+; CHECK-BE-LABEL: @check_be_vec_sldwi_va_vb_0
+; CHECK-BE: blr
+}
+
+define <4 x i32> @check_be_vec_sldwi_va_vb_1(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i32> %0
+; CHECK-BE-LABEL: @check_be_vec_sldwi_va_vb_1
+; CHECK-BE: xxsldwi 34, 34, 35, 1
+; CHECK-BE: blr
+}
+
+define <4 x i32> @check_be_vec_sldwi_va_vb_2(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x i32> %0
+; CHECK-BE-LABEL: @check_be_vec_sldwi_va_vb_2
+; CHECK-BE: xxsldwi 34, 34, 35, 2
+; CHECK-BE: blr
+}
+
+define <4 x i32> @check_be_vec_sldwi_va_vb_3(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+  ret <4 x i32> %0
+; CHECK-BE-LABEL: @check_be_vec_sldwi_va_vb_3
+; CHECK-BE: xxsldwi 34, 34, 35, 3
+; CHECK-BE: blr
+}
+
+define <4 x i32> @check_be_swap_vec_sldwi_va_vb_0(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x i32> %0
+; CHECK-BE-LABEL: @check_be_swap_vec_sldwi_va_vb_0
+; CHECK-LE; vmr 2, 3
+; CHECK-BE: blr
+}
+
+define <4 x i32> @check_be_swap_vec_sldwi_va_vb_1(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
+  ret <4 x i32> %0
+; CHECK-BE-LABEL: @check_be_swap_vec_sldwi_va_vb_1
+; CHECK-BE: xxsldwi 34, 35, 34, 1
+; CHECK-BE: blr
+}
+
+define <4 x i32> @check_be_swap_vec_sldwi_va_vb_2(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
+  ret <4 x i32> %0
+; CHECK-BE-LABEL: @check_be_swap_vec_sldwi_va_vb_2
+; CHECK-BE: xxsldwi 34, 35, 34, 2
+; CHECK-BE: blr
+}
+
+define <4 x i32> @check_be_swap_vec_sldwi_va_vb_3(<4 x i32> %VA, <4 x i32> %VB) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
+  ret <4 x i32> %0
+; CHECK-BE-LABEL: @check_be_swap_vec_sldwi_va_vb_3
+; CHECK-BE: xxsldwi 34, 35, 34, 3
+; CHECK-BE: blr
+}
+
+define <4 x i32> @check_be_vec_sldwi_va_undef_0(<4 x i32> %VA) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %0
+; CHECK-LE-LABEL: @check_be_vec_sldwi_va_undef_0
+; CHECK-BE: blr
+}
+
+define <4 x i32> @check_be_vec_sldwi_va_undef_1(<4 x i32> %VA) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+  ret <4 x i32> %0
+; CHECK-BE-LABEL: @check_be_vec_sldwi_va_undef_1
+; CHECK-BE: xxsldwi 34, 34, 34, 1
+; CHECK-BE: blr
+}
+
+define <4 x i32> @check_be_vec_sldwi_va_undef_2(<4 x i32> %VA) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  ret <4 x i32> %0
+; CHECK-BE-LABEL: @check_be_vec_sldwi_va_undef_2
+; CHECK-BE: xxswapd 34, 34
+; CHECK-BE: blr
+}
+
+define <4 x i32> @check_be_vec_sldwi_va_undef_3(<4 x i32> %VA) {
+entry:
+  %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+  ret <4 x i32> %0
+; CHECK-BE-LABEL: @check_be_vec_sldwi_va_undef_3
+; CHECK-BE: xxsldwi 34, 34, 34, 3
+; CHECK-BE: blr
+}
+
+; More test cases to test different types of vector inputs
+define <16 x i8> @test_le_vec_sldwi_v16i8_v16i8(<16 x i8> %VA, <16 x i8> %VB) {
+     entry:
+      %0 = shufflevector <16 x i8> %VA, <16 x i8> %VB,<16 x i32> <i32 28, i32 29, i32 30, i32 31,i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+      ret <16 x i8> %0
+; CHECK-LE-LABEL: @test_le_vec_sldwi_v16i8_v16i8
+; CHECK-LE: xxsldwi 34, 34, 35, 1
+; CHECK-LE: blr
+}
+
+define <8 x i16> @test_le_vec_sldwi_v8i16_v8i16(<8 x i16> %VA, <8 x i16> %VB) {
+     entry:
+      %0 = shufflevector <8 x i16> %VA, <8 x i16> %VB,<8 x i32> <i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+      ret <8 x i16> %0
+; CHECK-LE-LABEL: @test_le_vec_sldwi_v8i16_v8i16
+; CHECK-LE: xxsldwi 34, 34, 35, 1
+; CHECK-LE: blr
+}
+
+; Note here xxpermdi 34, 34, 35, 2 <=> xxsldwi 34, 34, 35, 2
+define <2 x i64> @test_be_vec_sldwi_v2i64_v2i64(<2 x i64> %VA, <2 x i64> %VB) {
+     entry:
+      %0 = shufflevector <2 x i64> %VA, <2 x i64> %VB,<2 x i32> <i32 3, i32 0>
+      ret <2 x i64> %0
+; CHECK-LE-LABEL: @test_be_vec_sldwi_v2i64_v2i64
+; CHECK-LE: xxpermdi 34, 34, 35, 2
+; CHECK-LE: blr
+}
diff --git a/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll b/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll
index acedc2606331..0f0426526cc1 100644
--- a/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll
+++ b/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll
@@ -26,82 +26,82 @@ entry:
 ; CHECK-LABEL: test1
 ; CHECK-P9-LABEL: test1
 ; CHECK: lxvd2x
-; CHECK-P9-DAG: lxvx
+; CHECK-P9-DAG: lxv
   %0 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* bitcast (<4 x i32>* @vsi to i8*))
 ; CHECK: stxvd2x
-; CHECK-P9-DAG: stxvx
+; CHECK-P9-DAG: stxv
   store <4 x i32> %0, <4 x i32>* @res_vsi, align 16
 ; CHECK: lxvd2x
-; CHECK-P9-DAG: lxvx
+; CHECK-P9-DAG: lxv
   %1 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* bitcast (<4 x i32>* @vui to i8*))
 ; CHECK: stxvd2x
-; CHECK-P9-DAG: stxvx
+; CHECK-P9-DAG: stxv
   store <4 x i32> %1, <4 x i32>* @res_vui, align 16
 ; CHECK: lxvd2x
-; CHECK-P9-DAG: lxvx
+; CHECK-P9-DAG: lxv
   %2 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* bitcast (<4 x float>* @vf to i8*))
   %3 = bitcast <4 x i32> %2 to <4 x float>
 ; CHECK: stxvd2x
-; CHECK-P9-DAG: stxvx
+; CHECK-P9-DAG: stxv
   store <4 x float> %3, <4 x float>* @res_vf, align 16
 ; CHECK: lxvd2x
-; CHECK-P9-DAG: lxvx
+; CHECK-P9-DAG: lxv
   %4 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* bitcast (<2 x i64>* @vsll to i8*))
   %5 = bitcast <2 x double> %4 to <2 x i64>
 ; CHECK: stxvd2x
-; CHECK-P9-DAG: stxvx
+; CHECK-P9-DAG: stxv
   store <2 x i64> %5, <2 x i64>* @res_vsll, align 16
 ; CHECK: lxvd2x
-; CHECK-P9-DAG: lxvx
+; CHECK-P9-DAG: lxv
   %6 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* bitcast (<2 x i64>* @vull to i8*))
   %7 = bitcast <2 x double> %6 to <2 x i64>
 ; CHECK: stxvd2x
-; CHECK-P9-DAG: stxvx
+; CHECK-P9-DAG: stxv
   store <2 x i64> %7, <2 x i64>* @res_vull, align 16
 ; CHECK: lxvd2x
-; CHECK-P9-DAG: lxvx
+; CHECK-P9-DAG: lxv
   %8 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* bitcast (<2 x double>* @vd to i8*))
 ; CHECK: stxvd2x
-; CHECK-P9-DAG: stxvx
+; CHECK-P9-DAG: stxv
   store <2 x double> %8, <2 x double>* @res_vd, align 16
 ; CHECK: lxvd2x
-; CHECK-P9-DAG: lxvx
+; CHECK-P9-DAG: lxv
   %9 = load <4 x i32>, <4 x i32>* @vsi, align 16
 ; CHECK: stxvd2x
-; CHECK-P9-DAG: stxvx
+; CHECK-P9-DAG: stxv
   call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %9, i8* bitcast (<4 x i32>* @res_vsi to i8*))
 ; CHECK: lxvd2x
-; CHECK-P9-DAG: lxvx
+; CHECK-P9-DAG: lxv
   %10 = load <4 x i32>, <4 x i32>* @vui, align 16
 ; CHECK: stxvd2x
-; CHECK-P9-DAG: stxvx
+; CHECK-P9-DAG: stxv
   call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %10, i8* bitcast (<4 x i32>* @res_vui to i8*))
 ; CHECK: lxvd2x
-; CHECK-P9-DAG: lxvx
+; CHECK-P9-DAG: lxv
   %11 = load <4 x float>, <4 x float>* @vf, align 16
   %12 = bitcast <4 x float> %11 to <4 x i32>
 ; CHECK: stxvd2x
-; CHECK-P9-DAG: stxvx
+; CHECK-P9-DAG: stxv
   call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %12, i8* bitcast (<4 x float>* @res_vf to i8*))
 ; CHECK: lxvd2x
-; CHECK-P9-DAG: lxvx
+; CHECK-P9-DAG: lxv
   %13 = load <2 x i64>, <2 x i64>* @vsll, align 16
   %14 = bitcast <2 x i64> %13 to <2 x double>
 ; CHECK: stxvd2x
-; CHECK-P9-DAG: stxvx
+; CHECK-P9-DAG: stxv
   call void @llvm.ppc.vsx.stxvd2x(<2 x double> %14, i8* bitcast (<2 x i64>* @res_vsll to i8*))
 ; CHECK: lxvd2x
-; CHECK-P9-DAG: lxvx
+; CHECK-P9-DAG: lxv
   %15 = load <2 x i64>, <2 x i64>* @vull, align 16
   %16 = bitcast <2 x i64> %15 to <2 x double>
 ; CHECK: stxvd2x
-; CHECK-P9-DAG: stxvx
+; CHECK-P9-DAG: stxv
   call void @llvm.ppc.vsx.stxvd2x(<2 x double> %16, i8* bitcast (<2 x i64>* @res_vull to i8*))
 ; CHECK: lxvd2x
-; CHECK-P9-DAG: lxvx
+; CHECK-P9-DAG: lxv
   %17 = load <2 x double>, <2 x double>* @vd, align 16
 ; CHECK: stxvd2x
-; CHECK-P9-DAG: stxvx
+; CHECK-P9-DAG: stxv
   call void @llvm.ppc.vsx.stxvd2x(<2 x double> %17, i8* bitcast (<2 x double>* @res_vd to i8*))
   ret void
 }
diff --git a/test/CodeGen/PowerPC/vsx-ldst.ll b/test/CodeGen/PowerPC/vsx-ldst.ll
index d8dd635aab5f..0bbc633363a7 100644
--- a/test/CodeGen/PowerPC/vsx-ldst.ll
+++ b/test/CodeGen/PowerPC/vsx-ldst.ll
@@ -21,8 +21,8 @@
 
 ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -O2 \
 ; RUN:   -mtriple=powerpc64le-unknown-linux-gnu < %s > %t
-; RUN: grep lxvx < %t | count 6
-; RUN: grep stxvx < %t | count 6
+; RUN: grep lxv < %t | count 6
+; RUN: grep stxv < %t | count 6
 
 
 @vsi = global <4 x i32> <i32 -1, i32 2, i32 -3, i32 4>, align 16
diff --git a/test/CodeGen/PowerPC/vsx-p9.ll b/test/CodeGen/PowerPC/vsx-p9.ll
index ba359501ccc5..0c29b6adad77 100644
--- a/test/CodeGen/PowerPC/vsx-p9.ll
+++ b/test/CodeGen/PowerPC/vsx-p9.ll
@@ -36,109 +36,109 @@ entry:
   %1 = load <16 x i8>, <16 x i8>* @ucb, align 16
   %add.i = add <16 x i8> %1, %0
   tail call void (...) @sink(<16 x i8> %add.i)
-; CHECK: lxvx 34, 0, 3
-; CHECK: lxvx 35, 0, 4
+; CHECK: lxv 34, 0(3)
+; CHECK: lxv 35, 0(4)
 ; CHECK: vaddubm 2, 3, 2
-; CHECK: stxvx 34,
+; CHECK: stxv 34,
 ; CHECK: bl sink
   %2 = load <16 x i8>, <16 x i8>* @sca, align 16
   %3 = load <16 x i8>, <16 x i8>* @scb, align 16
   %add.i22 = add <16 x i8> %3, %2
   tail call void (...) @sink(<16 x i8> %add.i22)
-; CHECK: lxvx 34, 0, 3
-; CHECK: lxvx 35, 0, 4
+; CHECK: lxv 34, 0(3)
+; CHECK: lxv 35, 0(4)
 ; CHECK: vaddubm 2, 3, 2
-; CHECK: stxvx 34,
+; CHECK: stxv 34,
 ; CHECK: bl sink
   %4 = load <8 x i16>, <8 x i16>* @usa, align 16
   %5 = load <8 x i16>, <8 x i16>* @usb, align 16
   %add.i21 = add <8 x i16> %5, %4
   tail call void (...) @sink(<8 x i16> %add.i21)
-; CHECK: lxvx 34, 0, 3
-; CHECK: lxvx 35, 0, 4
+; CHECK: lxv 34, 0(3)
+; CHECK: lxv 35, 0(4)
 ; CHECK: vadduhm 2, 3, 2
-; CHECK: stxvx 34,
+; CHECK: stxv 34,
 ; CHECK: bl sink
   %6 = load <8 x i16>, <8 x i16>* @ssa, align 16
   %7 = load <8 x i16>, <8 x i16>* @ssb, align 16
   %add.i20 = add <8 x i16> %7, %6
   tail call void (...) @sink(<8 x i16> %add.i20)
-; CHECK: lxvx 34, 0, 3
-; CHECK: lxvx 35, 0, 4
+; CHECK: lxv 34, 0(3)
+; CHECK: lxv 35, 0(4)
 ; CHECK: vadduhm 2, 3, 2
-; CHECK: stxvx 34,
+; CHECK: stxv 34,
 ; CHECK: bl sink
   %8 = load <4 x i32>, <4 x i32>* @uia, align 16
   %9 = load <4 x i32>, <4 x i32>* @uib, align 16
   %add.i19 = add <4 x i32> %9, %8
   tail call void (...) @sink(<4 x i32> %add.i19)
-; CHECK: lxvx 34, 0, 3
-; CHECK: lxvx 35, 0, 4
+; CHECK: lxv 34, 0(3)
+; CHECK: lxv 35, 0(4)
 ; CHECK: vadduwm 2, 3, 2
-; CHECK: stxvx 34,
+; CHECK: stxv 34,
 ; CHECK: bl sink
   %10 = load <4 x i32>, <4 x i32>* @sia, align 16
   %11 = load <4 x i32>, <4 x i32>* @sib, align 16
   %add.i18 = add <4 x i32> %11, %10
   tail call void (...) @sink(<4 x i32> %add.i18)
-; CHECK: lxvx 34, 0, 3
-; CHECK: lxvx 35, 0, 4
+; CHECK: lxv 34, 0(3)
+; CHECK: lxv 35, 0(4)
 ; CHECK: vadduwm 2, 3, 2
-; CHECK: stxvx 34,
+; CHECK: stxv 34,
 ; CHECK: bl sink
   %12 = load <2 x i64>, <2 x i64>* @ulla, align 16
   %13 = load <2 x i64>, <2 x i64>* @ullb, align 16
   %add.i17 = add <2 x i64> %13, %12
   tail call void (...) @sink(<2 x i64> %add.i17)
-; CHECK: lxvx 34, 0, 3
-; CHECK: lxvx 35, 0, 4
+; CHECK: lxv 34, 0(3)
+; CHECK: lxv 35, 0(4)
 ; CHECK: vaddudm 2, 3, 2
-; CHECK: stxvx 34,
+; CHECK: stxv 34,
 ; CHECK: bl sink
   %14 = load <2 x i64>, <2 x i64>* @slla, align 16
   %15 = load <2 x i64>, <2 x i64>* @sllb, align 16
   %add.i16 = add <2 x i64> %15, %14
   tail call void (...) @sink(<2 x i64> %add.i16)
-; CHECK: lxvx 34, 0, 3
-; CHECK: lxvx 35, 0, 4
+; CHECK: lxv 34, 0(3)
+; CHECK: lxv 35, 0(4)
 ; CHECK: vaddudm 2, 3, 2
-; CHECK: stxvx 34,
+; CHECK: stxv 34,
 ; CHECK: bl sink
   %16 = load <1 x i128>, <1 x i128>* @uxa, align 16
   %17 = load <1 x i128>, <1 x i128>* @uxb, align 16
   %add.i15 = add <1 x i128> %17, %16
   tail call void (...) @sink(<1 x i128> %add.i15)
-; CHECK: lxvx 34, 0, 3
-; CHECK: lxvx 35, 0, 4
+; CHECK: lxv 34, 0(3)
+; CHECK: lxv 35, 0(4)
 ; CHECK: vadduqm 2, 3, 2
-; CHECK: stxvx 34,
+; CHECK: stxv 34,
 ; CHECK: bl sink
   %18 = load <1 x i128>, <1 x i128>* @sxa, align 16
   %19 = load <1 x i128>, <1 x i128>* @sxb, align 16
   %add.i14 = add <1 x i128> %19, %18
   tail call void (...) @sink(<1 x i128> %add.i14)
-; CHECK: lxvx 34, 0, 3
-; CHECK: lxvx 35, 0, 4
+; CHECK: lxv 34, 0(3)
+; CHECK: lxv 35, 0(4)
 ; CHECK: vadduqm 2, 3, 2
-; CHECK: stxvx 34,
+; CHECK: stxv 34,
 ; CHECK: bl sink
   %20 = load <4 x float>, <4 x float>* @vfa, align 16
   %21 = load <4 x float>, <4 x float>* @vfb, align 16
   %add.i13 = fadd <4 x float> %20, %21
   tail call void (...) @sink(<4 x float> %add.i13)
-; CHECK: lxvx 0, 0, 3
-; CHECK: lxvx 1, 0, 4
+; CHECK: lxv 0, 0(3)
+; CHECK: lxv 1, 0(4)
 ; CHECK: xvaddsp 34, 0, 1
-; CHECK: stxvx 34,
+; CHECK: stxv 34,
 ; CHECK: bl sink
   %22 = load <2 x double>, <2 x double>* @vda, align 16
   %23 = load <2 x double>, <2 x double>* @vdb, align 16
   %add.i12 = fadd <2 x double> %22, %23
   tail call void (...) @sink(<2 x double> %add.i12)
-; CHECK: lxvx 0, 0, 3
-; CHECK: lxvx 1, 0, 4
+; CHECK: lxv 0, 0(3)
+; CHECK: lxv 1, 0(4)
 ; CHECK: xvadddp 0, 0, 1
-; CHECK: stxvx 0,
+; CHECK: stxv 0,
 ; CHECK: bl sink
   ret void
 }
diff --git a/test/CodeGen/PowerPC/vsx_insert_extract_le.ll b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll
index 09bf6830416f..98fe3a813cb7 100644
--- a/test/CodeGen/PowerPC/vsx_insert_extract_le.ll
+++ b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll
@@ -23,7 +23,7 @@ define <2 x double> @testi0(<2 x double>* %p1, double* %p2) {
 
 ; CHECK-P9-LABEL: testi0
 ; CHECK-P9: lfd [[REG1:[0-9]+]], 0(4)
-; CHECK-P9: lxvx [[REG2:[0-9]+]], 0, 3
+; CHECK-P9: lxv [[REG2:[0-9]+]], 0(3)
 ; CHECK-P9: xxspltd [[REG3:[0-9]+]], [[REG1]], 0
 ; CHECK-P9: xxpermdi 34, [[REG2]], [[REG3]], 1
 }
@@ -43,7 +43,7 @@ define <2 x double> @testi1(<2 x double>* %p1, double* %p2) {
 
 ; CHECK-P9-LABEL: testi1
 ; CHECK-P9: lfd [[REG1:[0-9]+]], 0(4)
-; CHECK-P9: lxvx [[REG2:[0-9]+]], 0, 3
+; CHECK-P9: lxv [[REG2:[0-9]+]], 0(3)
 ; CHECK-P9: xxspltd [[REG3:[0-9]+]], [[REG1]], 0
 ; CHECK-P9: xxmrgld 34, [[REG3]], [[REG2]]
 }
diff --git a/test/CodeGen/PowerPC/vsx_shuffle_le.ll b/test/CodeGen/PowerPC/vsx_shuffle_le.ll
index 3bf24adfdd91..cfe201999282 100644
--- a/test/CodeGen/PowerPC/vsx_shuffle_le.ll
+++ b/test/CodeGen/PowerPC/vsx_shuffle_le.ll
@@ -19,7 +19,7 @@ define <2 x double> @test00(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxspltd 34, 0, 0
 
 ; CHECK-P9-LABEL: test00
-; CHECK-P9: lxvx 0, 0, 3
+; CHECK-P9: lxv 0, 0(3)
 ; CHECK-P9: xxspltd 34, 0, 1
 }
 
@@ -34,7 +34,7 @@ define <2 x double> @test01(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxswapd 34, 0
 
 ; CHECK-P9-LABEL: test01
-; CHECK-P9: lxvx 34, 0, 3
+; CHECK-P9: lxv 34, 0(3)
 }
 
 define <2 x double> @test02(<2 x double>* %p1, <2 x double>* %p2) {
@@ -51,8 +51,8 @@ define <2 x double> @test02(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxmrgld 34, 1, 0
 
 ; CHECK-P9-LABEL: @test02
-; CHECK-P9: lxvx 0, 0, 3
-; CHECK-P9: lxvx 1, 0, 4
+; CHECK-P9: lxv 0, 0(3)
+; CHECK-P9: lxv 1, 0(4)
 ; CHECK-P9: xxmrgld 34, 1, 0
 }
 
@@ -70,8 +70,8 @@ define <2 x double> @test03(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxpermdi 34, 1, 0, 1
 
 ; CHECK-P9-LABEL: @test03
-; CHECK-P9: lxvx 0, 0, 3
-; CHECK-P9: lxvx 1, 0, 4
+; CHECK-P9: lxv 0, 0(3)
+; CHECK-P9: lxv 1, 0(4)
 ; CHECK-P9: xxpermdi 34, 1, 0, 1
 }
 
@@ -85,7 +85,7 @@ define <2 x double> @test10(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: lxvd2x 34, 0, 3
 
 ; CHECK-P9-LABEL: @test10
-; CHECK-P9: lxvx 0, 0, 3
+; CHECK-P9: lxv 0, 0(3)
 ; CHECK-P9: xxswapd 34, 0
 }
 
@@ -100,7 +100,7 @@ define <2 x double> @test11(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxspltd 34, 0, 1
 
 ; CHECK-P9-LABEL: @test11
-; CHECK-P9: lxvx 0, 0, 3
+; CHECK-P9: lxv 0, 0(3)
 ; CHECK-P9: xxspltd 34, 0, 0
 }
 
@@ -118,8 +118,8 @@ define <2 x double> @test12(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxpermdi 34, 1, 0, 2
 
 ; CHECK-P9-LABEL: @test12
-; CHECK-P9: lxvx 0, 0, 3
-; CHECK-P9: lxvx 1, 0, 4
+; CHECK-P9: lxv 0, 0(3)
+; CHECK-P9: lxv 1, 0(4)
 ; CHECK-P9: xxpermdi 34, 1, 0, 2
 }
 
@@ -137,8 +137,8 @@ define <2 x double> @test13(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxmrghd 34, 1, 0
 
 ; CHECK-P9-LABEL: @test13
-; CHECK-P9: lxvx 0, 0, 3
-; CHECK-P9: lxvx 1, 0, 4
+; CHECK-P9: lxv 0, 0(3)
+; CHECK-P9: lxv 1, 0(4)
 ; CHECK-P9: xxmrghd 34, 1, 0
 }
 
@@ -156,8 +156,8 @@ define <2 x double> @test20(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxmrgld 34, 0, 1
 
 ; CHECK-P9-LABEL: @test20
-; CHECK-P9: lxvx 0, 0, 3
-; CHECK-P9: lxvx 1, 0, 4
+; CHECK-P9: lxv 0, 0(3)
+; CHECK-P9: lxv 1, 0(4)
 ; CHECK-P9: xxmrgld 34, 0, 1
 }
 
@@ -175,8 +175,8 @@ define <2 x double> @test21(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxpermdi 34, 0, 1, 1
 
 ; CHECK-P9-LABEL: @test21
-; CHECK-P9: lxvx 0, 0, 3
-; CHECK-P9: lxvx 1, 0, 4
+; CHECK-P9: lxv 0, 0(3)
+; CHECK-P9: lxv 1, 0(4)
 ; CHECK-P9: xxpermdi 34, 0, 1, 1
 }
 
@@ -191,7 +191,7 @@ define <2 x double> @test22(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxspltd 34, 0, 0
 
 ; CHECK-P9-LABEL: @test22
-; CHECK-P9: lxvx 0, 0, 4
+; CHECK-P9: lxv 0, 0(4)
 ; CHECK-P9: xxspltd 34, 0, 1
 }
 
@@ -206,7 +206,7 @@ define <2 x double> @test23(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxswapd 34, 0
 
 ; CHECK-P9-LABEL: @test23
-; CHECK-P9: lxvx 34, 0, 4
+; CHECK-P9: lxv 34, 0(4)
 }
 
 define <2 x double> @test30(<2 x double>* %p1, <2 x double>* %p2) {
@@ -223,8 +223,8 @@ define <2 x double> @test30(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxpermdi 34, 0, 1, 2
 
 ; CHECK-P9-LABEL: @test30
-; CHECK-P9: lxvx 0, 0, 3
-; CHECK-P9: lxvx 1, 0, 4
+; CHECK-P9: lxv 0, 0(3)
+; CHECK-P9: lxv 1, 0(4)
 ; CHECK-P9: xxpermdi 34, 0, 1, 2
 }
 
@@ -242,8 +242,8 @@ define <2 x double> @test31(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxmrghd 34, 0, 1
 
 ; CHECK-P9-LABEL: @test31
-; CHECK-P9: lxvx 0, 0, 3
-; CHECK-P9: lxvx 1, 0, 4
+; CHECK-P9: lxv 0, 0(3)
+; CHECK-P9: lxv 1, 0(4)
 ; CHECK-P9: xxmrghd 34, 0, 1
 }
 
@@ -257,7 +257,7 @@ define <2 x double> @test32(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: lxvd2x 34, 0, 4
 
 ; CHECK-P9-LABEL: @test32
-; CHECK-P9: lxvx 0, 0, 4
+; CHECK-P9: lxv 0, 0(4)
 ; CHECK-P9: xxswapd 34, 0
 }
 
@@ -272,6 +272,6 @@ define <2 x double> @test33(<2 x double>* %p1, <2 x double>* %p2) {
 ; CHECK: xxspltd 34, 0, 1
 
 ; CHECK-P9-LABEL: @test33
-; CHECK-P9: lxvx 0, 0, 4
+; CHECK-P9: lxv 0, 0(4)
 ; CHECK-P9: xxspltd 34, 0, 0
 }
diff --git a/test/CodeGen/Thumb/machine-cse-physreg.mir b/test/CodeGen/Thumb/machine-cse-physreg.mir
new file mode 100644
index 000000000000..5206e89cf779
--- /dev/null
+++ b/test/CodeGen/Thumb/machine-cse-physreg.mir
@@ -0,0 +1,35 @@
+# RUN: llc -mtriple thumbv5e -run-pass=machine-cse -o - %s | FileCheck %s
+
+# This is a contrived example made to expose a bug in
+# MachineCSE, see PR32538.
+
+# MachineCSE must not remove this def of %cpsr:
+# CHECK-LABEL: bb.1:
+# CHECK: , %cpsr = tLSLri
+
+...
+---
+name:            spam
+registers:
+  - { id: 0, class: tgpr }
+  - { id: 1, class: tgpr }
+  - { id: 2, class: tgpr }
+  - { id: 3, class: tgpr }
+liveins:
+  - { reg: '%r0', virtual-reg: '%0' }
+body:             |
+  bb.0:
+    liveins: %r0
+    %0 = COPY %r0
+    %1, %cpsr = tLSLri %0, 2, 14, _
+    tCMPi8 %0, 5, 14, _, implicit-def %cpsr
+    tBcc %bb.8, 8, %cpsr
+
+  bb.1:
+    %2, %cpsr = tLSLri %0, 2, 14, _
+
+  bb.8:
+    liveins: %cpsr
+    %3 = COPY %cpsr
+    tSTRi killed %3, %0, 0, 14, _
+...
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 172a00a7c86f..89cb71a52c04 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machine-licm"
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machinelicm"
 ; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
 ; rdar://6627786
 ; rdar://7792037
diff --git a/test/CodeGen/X86/GlobalISel/memop-vec.ll b/test/CodeGen/X86/GlobalISel/memop-vec.ll
index f1ffc15f4d03..870e812bbb69 100644
--- a/test/CodeGen/X86/GlobalISel/memop-vec.ll
+++ b/test/CodeGen/X86/GlobalISel/memop-vec.ll
@@ -1,39 +1,116 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx                       -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX
-; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx                       -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX
 
 define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) {
-; ALL-LABEL: test_load_v4i32_noalign:
-; ALL:       # BB#0:
-; ALL-NEXT:    vmovups (%rdi), %xmm0
-; ALL-NEXT:    retq
+; SKX-LABEL: test_load_v4i32_noalign:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovups (%rdi), %xmm0
+; SKX-NEXT:    retq
   %r = load <4 x i32>, <4 x i32>* %p1, align 1
   ret <4 x i32> %r
 }
 
 define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) {
-; ALL-LABEL: test_load_v4i32_align:
-; ALL:       # BB#0:
-; ALL-NEXT:    vmovaps (%rdi), %xmm0
-; ALL-NEXT:    retq
+; SKX-LABEL: test_load_v4i32_align:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps (%rdi), %xmm0
+; SKX-NEXT:    retq
   %r = load <4 x i32>, <4 x i32>* %p1, align 16
   ret <4 x i32> %r
 }
 
+define <8 x i32> @test_load_v8i32_noalign(<8 x i32> * %p1) {
+; SKX-LABEL: test_load_v8i32_noalign:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovups (%rdi), %ymm0
+; SKX-NEXT:    retq
+  %r = load <8 x i32>, <8 x i32>* %p1, align 1
+  ret <8 x i32> %r
+}
+
+define <8 x i32> @test_load_v8i32_align(<8 x i32> * %p1) {
+; SKX-LABEL: test_load_v8i32_align:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps (%rdi), %ymm0
+; SKX-NEXT:    retq
+  %r = load <8 x i32>, <8 x i32>* %p1, align 32
+  ret <8 x i32> %r
+}
+
+define <16 x i32> @test_load_v16i32_noalign(<16 x i32> * %p1) {
+; SKX-LABEL: test_load_v16i32_noalign:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovups (%rdi), %zmm0
+; SKX-NEXT:    retq
+  %r = load <16 x i32>, <16 x i32>* %p1, align 1
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @test_load_v16i32_align(<16 x i32> * %p1) {
+; SKX-LABEL: test_load_v16i32_align:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovups (%rdi), %zmm0
+; SKX-NEXT:    retq
+  %r = load <16 x i32>, <16 x i32>* %p1, align 32
+  ret <16 x i32> %r
+}
+
 define void @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) {
-; ALL-LABEL: test_store_v4i32_noalign:
-; ALL:       # BB#0:
-; ALL-NEXT:    vmovups %xmm0, (%rdi)
-; ALL-NEXT:    retq
+; SKX-LABEL: test_store_v4i32_noalign:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovups %xmm0, (%rdi)
+; SKX-NEXT:    retq
   store <4 x i32> %val, <4 x i32>* %p1, align 1
   ret void
 }
 
 define void @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) {
-; ALL-LABEL: test_store_v4i32_align:
-; ALL:       # BB#0:
-; ALL-NEXT:    vmovaps %xmm0, (%rdi)
-; ALL-NEXT:    retq
+; SKX-LABEL: test_store_v4i32_align:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps %xmm0, (%rdi)
+; SKX-NEXT:    retq
   store <4 x i32> %val, <4 x i32>* %p1, align 16
   ret void
 }
+
+define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) {
+; SKX-LABEL: test_store_v8i32_noalign:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovups %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  store <8 x i32> %val, <8 x i32>* %p1, align 1
+  ret void
+}
+
+define void @test_store_v8i32_align(<8 x i32> %val, <8 x i32>* %p1) {
+; SKX-LABEL: test_store_v8i32_align:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  store <8 x i32> %val, <8 x i32>* %p1, align 32
+  ret void
+}
+
+define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) {
+; SKX-LABEL: test_store_v16i32_noalign:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovups %zmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  store <16 x i32> %val, <16 x i32>* %p1, align 1
+  ret void
+}
+
+define void @test_store_v16i32_align(<16 x i32> %val, <16 x i32>* %p1) {
+; SKX-LABEL: test_store_v16i32_align:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps %zmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  store <16 x i32> %val, <16 x i32>* %p1, align 64
+  ret void
+}
+
diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir b/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir
index f925c836f3d1..cc03f3a57f0b 100644
--- a/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir
+++ b/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir
@@ -14,7 +14,16 @@
     ret void
   }
 
-...
+  define <8 x i32> @test_load_v8i32_noalign(<8 x i32>* %p1) {
+    %r = load <8 x i32>, <8 x i32>* %p1, align 1
+    ret <8 x i32> %r
+  }
+
+  define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) {
+    store <8 x i32> %val, <8 x i32>* %p1, align 1
+    ret void
+  }
+
 ---
 name:            test_mul_vec256
 alignment:       4
@@ -84,3 +93,47 @@ body:             |
     RET 0
 
 ...
+---
+name:            test_load_v8i32_noalign
+# CHECK-LABEL: name:  test_load_v8i32_noalign
+alignment:       4
+legalized:       true
+regBankSelected: false
+# CHECK:       registers:
+# CHECK-NEXT:    - { id: 0, class: gpr }
+# CHECK-NEXT:    - { id: 1, class: vecr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(<8 x s32>) = G_LOAD %0(p0) :: (load 32 from %ir.p1, align 1)
+    %ymm0 = COPY %1(<8 x s32>)
+    RET 0, implicit %ymm0
+
+...
+---
+name:            test_store_v8i32_noalign
+# CHECK-LABEL: name:  test_store_v8i32_noalign
+alignment:       4
+legalized:       true
+regBankSelected: false
+# CHECK:       registers:
+# CHECK-NEXT:    - { id: 0, class: vecr }
+# CHECK-NEXT:    - { id: 1, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %ymm0
+
+    %0(<8 x s32>) = COPY %ymm0
+    %1(p0) = COPY %rdi
+    G_STORE %0(<8 x s32>), %1(p0) :: (store 32 into %ir.p1, align 1)
+    RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir b/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir
index e0c12ff44a2f..278413ad38ef 100644
--- a/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir
+++ b/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir
@@ -15,22 +15,29 @@
     ret void
   }
 
+  define <16 x i32> @test_load_v16i32_noalign(<16 x i32>* %p1) {
+    %r = load <16 x i32>, <16 x i32>* %p1, align 1
+    ret <16 x i32> %r
+  }
+
+  define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) {
+    store <16 x i32> %val, <16 x i32>* %p1, align 1
+    ret void
+  }
+
 ...
 ---
 name:            test_mul_vec512
+# CHECK-LABEL: name:  test_mul_vec512
 alignment:       4
 legalized:       true
 regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-# CHECK-LABEL: name:            test_mul_vec512
-# CHECK: registers:
-# CHECK:  - { id: 0, class: vecr }
-# CHECK:  - { id: 1, class: vecr }
+# CHECK:       registers:
+# CHECK-NEXT:    - { id: 0, class: vecr }
+# CHECK-NEXT:    - { id: 1, class: vecr }
 registers:
   - { id: 0, class: _ }
   - { id: 1, class: _ }
-  - { id: 2, class: _ }
 body:             |
   bb.1 (%ir-block.0):
 
@@ -41,19 +48,16 @@ body:             |
 ...
 ---
 name:            test_add_vec512
+# CHECK-LABEL: name:  test_add_vec512
 alignment:       4
 legalized:       true
 regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-# CHECK-LABEL: name:            test_add_vec512
-# CHECK: registers:
-# CHECK:  - { id: 0, class: vecr }
-# CHECK:  - { id: 1, class: vecr }
+# CHECK:       registers:
+# CHECK-NEXT:    - { id: 0, class: vecr }
+# CHECK-NEXT:    - { id: 1, class: vecr }
 registers:
   - { id: 0, class: _ }
   - { id: 1, class: _ }
-  - { id: 2, class: _ }
 body:             |
   bb.1 (%ir-block.0):
 
@@ -64,24 +68,65 @@ body:             |
 ...
 ---
 name:            test_sub_vec512
+# CHECK-LABEL: name:  test_sub_vec512
 alignment:       4
 legalized:       true
 regBankSelected: false
-selected:        false
-tracksRegLiveness: true
-# CHECK-LABEL: name:            test_sub_vec512
-# CHECK: registers:
-# CHECK:  - { id: 0, class: vecr }
-# CHECK:  - { id: 1, class: vecr }
+# CHECK:       registers:
+# CHECK-NEXT:    - { id: 0, class: vecr }
+# CHECK-NEXT:    - { id: 1, class: vecr }
 registers:
   - { id: 0, class: _ }
   - { id: 1, class: _ }
-  - { id: 2, class: _ }
 body:             |
   bb.1 (%ir-block.0):
 
     %0(<16 x s32>) = IMPLICIT_DEF
     %1(<16 x s32>) = G_SUB %0, %0
     RET 0
+...
+---
+
+name:            test_load_v16i32_noalign
+# CHECK-LABEL: name:  test_load_v16i32_noalign
+alignment:       4
+legalized:       true
+regBankSelected: false
+# CHECK:       registers:
+# CHECK-NEXT:    - { id: 0, class: gpr }
+# CHECK-NEXT:    - { id: 1, class: vecr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 1)
+    %zmm0 = COPY %1(<16 x s32>)
+    RET 0, implicit %zmm0
+
+...
+---
+name:            test_store_v16i32_noalign
+# CHECK-LABEL: name:  test_store_v16i32_noalign
+alignment:       4
+legalized:       true
+regBankSelected: false
+# CHECK:       registers:
+# CHECK-NEXT:    - { id: 0, class: vecr }
+# CHECK-NEXT:    - { id: 1, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %zmm0
+
+    %0(<16 x s32>) = COPY %zmm0
+    %1(p0) = COPY %rdi
+    G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 1)
+    RET 0
 
 ...
diff --git a/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir b/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir
new file mode 100644
index 000000000000..539520c0b8f5
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir
@@ -0,0 +1,96 @@
+# RUN: llc -mtriple=i586-linux-gnu -mcpu=haswell -mattr=-slow-incdec -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK
+#
+# This is necessary to test that attribute-based rule predicates work and that
+# they properly reset between functions.
+
+--- |
+  define i32 @const_i32_1() {
+    ret i32 1
+  }
+
+  define i32 @const_i32_1_optsize() #0 {
+    ret i32 1
+  }
+
+  define i32 @const_i32_1b() {
+    ret i32 1
+  }
+
+  define i32 @const_i32_1_optsizeb() #0 {
+    ret i32 1
+  }
+
+  attributes #0 = { optsize }
+...
+---
+name:            const_i32_1
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK-LABEL: name: const_i32_1
+# CHECK:       registers:
+# CHECK-NEXT:  - { id: 0, class: gr32 }
+registers:
+  - { id: 0, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = MOV32ri 1
+body:             |
+  bb.1 (%ir-block.0):
+    %0(s32) = G_CONSTANT i32 1
+    %eax = COPY %0(s32)
+    RET 0, implicit %eax
+...
+---
+name:            const_i32_1_optsize
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK-LABEL: name: const_i32_1_optsize
+# CHECK:       registers:
+# CHECK-NEXT:  - { id: 0, class: gr32 }
+registers:
+  - { id: 0, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = MOV32r1
+body:             |
+  bb.1 (%ir-block.0):
+    %0(s32) = G_CONSTANT i32 1
+    %eax = COPY %0(s32)
+    RET 0, implicit %eax
+...
+---
+name:            const_i32_1b
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK-LABEL: name: const_i32_1b
+# CHECK:       registers:
+# CHECK-NEXT:  - { id: 0, class: gr32 }
+registers:
+  - { id: 0, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = MOV32ri 1
+body:             |
+  bb.1 (%ir-block.0):
+    %0(s32) = G_CONSTANT i32 1
+    %eax = COPY %0(s32)
+    RET 0, implicit %eax
+...
+---
+name:            const_i32_1_optsizeb
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK-LABEL: name: const_i32_1_optsizeb
+# CHECK:       registers:
+# CHECK-NEXT:  - { id: 0, class: gr32 }
+registers:
+  - { id: 0, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = MOV32r1
+body:             |
+  bb.1 (%ir-block.0):
+    %0(s32) = G_CONSTANT i32 1
+    %eax = COPY %0(s32)
+    RET 0, implicit %eax
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v256.mir b/test/CodeGen/X86/GlobalISel/select-memop-v256.mir
new file mode 100644
index 000000000000..b9a7e4a8cc4a
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-memop-v256.mir
@@ -0,0 +1,188 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx                      -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f                  -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL
+
+
+--- |
+  define <8 x i32> @test_load_v8i32_noalign(<8 x i32>* %p1) {
+    %r = load <8 x i32>, <8 x i32>* %p1, align 1
+    ret <8 x i32> %r
+  }
+
+  define <8 x i32> @test_load_v8i32_align(<8 x i32>* %p1) {
+    %r = load <8 x i32>, <8 x i32>* %p1, align 32
+    ret <8 x i32> %r
+  }
+
+  define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) {
+    store <8 x i32> %val, <8 x i32>* %p1, align 1
+    ret void
+  }
+
+  define void @test_store_v8i32_align(<8 x i32> %val, <8 x i32>* %p1) {
+    store <8 x i32> %val, <8 x i32>* %p1, align 32
+    ret void
+  }
+
+
+...
+---
+name:            test_load_v8i32_noalign
+# ALL-LABEL: name:  test_load_v8i32_noalign
+alignment:       4
+legalized:       true
+regBankSelected: true
+# NO_AVX512F:       registers:
+# NO_AVX512F-NEXT:    - { id: 0, class: gr64 }
+# NO_AVX512F-NEXT:    - { id: 1, class: vr256 }
+#
+# AVX512ALL:        registers:
+# AVX512ALL-NEXT:     - { id: 0, class: gr64 }
+# AVX512ALL-NEXT:     - { id: 1, class: vr256x }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: vecr }
+# NO_AVX512F:           %0 = COPY %rdi
+# NO_AVX512F-NEXT:      %1 = VMOVUPSYrm %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1)
+# NO_AVX512F-NEXT:      %ymm0 = COPY %1
+# NO_AVX512F-NEXT:      RET 0, implicit %ymm0
+#
+# AVX512F:              %0 = COPY %rdi
+# AVX512F-NEXT:         %1 = VMOVUPSZ256rm_NOVLX %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1)
+# AVX512F-NEXT:         %ymm0 = COPY %1
+# AVX512F-NEXT:         RET 0, implicit %ymm0
+#
+# AVX512VL:             %0 = COPY %rdi
+# AVX512VL-NEXT:        %1 = VMOVUPSZ256rm %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1)
+# AVX512VL-NEXT:        %ymm0 = COPY %1
+# AVX512VL-NEXT:        RET 0, implicit %ymm0
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(<8 x s32>) = G_LOAD %0(p0) :: (load 32 from %ir.p1, align 1)
+    %ymm0 = COPY %1(<8 x s32>)
+    RET 0, implicit %ymm0
+
+...
+---
+name:            test_load_v8i32_align
+# ALL-LABEL: name:  test_load_v8i32_align
+alignment:       4
+legalized:       true
+regBankSelected: true
+# NO_AVX512F:       registers:
+# NO_AVX512F-NEXT:    - { id: 0, class: gr64 }
+# NO_AVX512F-NEXT:    - { id: 1, class: vr256 }
+#
+# AVX512ALL:        registers:
+# AVX512ALL-NEXT:     - { id: 0, class: gr64 }
+# AVX512ALL-NEXT:     - { id: 1, class: vr256x }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: vecr }
+# NO_AVX512F:           %0 = COPY %rdi
+# NO_AVX512F-NEXT:      %1 = VMOVAPSYrm %0, 1, _, 0, _ :: (load 32 from %ir.p1)
+# NO_AVX512F-NEXT:      %ymm0 = COPY %1
+# NO_AVX512F-NEXT:      RET 0, implicit %ymm0
+#
+# AVX512F:              %0 = COPY %rdi
+# AVX512F-NEXT:         %1 = VMOVAPSZ256rm_NOVLX %0, 1, _, 0, _ :: (load 32 from %ir.p1)
+# AVX512F-NEXT:         %ymm0 = COPY %1
+# AVX512F-NEXT:         RET 0, implicit %ymm0
+#
+# AVX512VL:             %0 = COPY %rdi
+# AVX512VL-NEXT:        %1 = VMOVAPSZ256rm %0, 1, _, 0, _ :: (load 32 from %ir.p1)
+# AVX512VL-NEXT:        %ymm0 = COPY %1
+# AVX512VL-NEXT:        RET 0, implicit %ymm0
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(<8 x s32>) = G_LOAD %0(p0) :: (load 32 from %ir.p1)
+    %ymm0 = COPY %1(<8 x s32>)
+    RET 0, implicit %ymm0
+
+...
+---
+name:            test_store_v8i32_noalign
+# ALL-LABEL: name:  test_store_v8i32_noalign
+alignment:       4
+legalized:       true
+regBankSelected: true
+# NO_AVX512F:       registers:
+# NO_AVX512F-NEXT:    - { id: 0, class: vr256 }
+# NO_AVX512F-NEXT:    - { id: 1, class: gr64 }
+#
+# AVX512ALL:        registers:
+# AVX512ALL-NEXT:     - { id: 0, class: vr256x }
+# AVX512ALL-NEXT:     - { id: 1, class: gr64 }
+registers:
+  - { id: 0, class: vecr }
+  - { id: 1, class: gpr }
+# NO_AVX512F:           %0 = COPY %ymm0
+# NO_AVX512F-NEXT:      %1 = COPY %rdi
+# NO_AVX512F-NEXT:      VMOVUPSYmr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1)
+# NO_AVX512F-NEXT:      RET 0
+#
+# AVX512F:              %0 = COPY %ymm0
+# AVX512F-NEXT:         %1 = COPY %rdi
+# AVX512F-NEXT:         VMOVUPSZ256mr_NOVLX %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1)
+# AVX512F-NEXT:         RET 0
+#
+# AVX512VL:             %0 = COPY %ymm0
+# AVX512VL-NEXT:        %1 = COPY %rdi
+# AVX512VL-NEXT:        VMOVUPSZ256mr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1)
+# AVX512VL-NEXT:        RET 0
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %ymm0
+
+    %0(<8 x s32>) = COPY %ymm0
+    %1(p0) = COPY %rdi
+    G_STORE %0(<8 x s32>), %1(p0) :: (store 32 into %ir.p1, align 1)
+    RET 0
+
+...
+---
+name:            test_store_v8i32_align
+# ALL-LABEL: name:  test_store_v8i32_align
+alignment:       4
+legalized:       true
+regBankSelected: true
+# NO_AVX512F:       registers:
+# NO_AVX512F-NEXT:    - { id: 0, class: vr256 }
+# NO_AVX512F-NEXT:    - { id: 1, class: gr64 }
+#
+# AVX512ALL:        registers:
+# AVX512ALL-NEXT:     - { id: 0, class: vr256x }
+# AVX512ALL-NEXT:     - { id: 1, class: gr64 }
+registers:
+  - { id: 0, class: vecr }
+  - { id: 1, class: gpr }
+# NO_AVX512F:           %0 = COPY %ymm0
+# NO_AVX512F-NEXT:      %1 = COPY %rdi
+# NO_AVX512F-NEXT:      VMOVAPSYmr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1)
+# NO_AVX512F-NEXT:      RET 0
+#
+# AVX512F:              %0 = COPY %ymm0
+# AVX512F-NEXT:         %1 = COPY %rdi
+# AVX512F-NEXT:         VMOVAPSZ256mr_NOVLX %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1)
+# AVX512F-NEXT:         RET 0
+#
+# AVX512VL:             %0 = COPY %ymm0
+# AVX512VL-NEXT:        %1 = COPY %rdi
+# AVX512VL-NEXT:        VMOVAPSZ256mr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1)
+# AVX512VL-NEXT:        RET 0
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %ymm0
+
+    %0(<8 x s32>) = COPY %ymm0
+    %1(p0) = COPY %rdi
+    G_STORE %0(<8 x s32>), %1(p0) :: (store 32 into %ir.p1)
+    RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v512.mir b/test/CodeGen/X86/GlobalISel/select-memop-v512.mir
new file mode 100644
index 000000000000..87978a684d4c
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-memop-v512.mir
@@ -0,0 +1,127 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX512F
+--- |
+  define <16 x i32> @test_load_v16i32_noalign(<16 x i32>* %p1) {
+    %r = load <16 x i32>, <16 x i32>* %p1, align 1
+    ret <16 x i32> %r
+  }
+
+  define <16 x i32> @test_load_v16i32_align(<16 x i32>* %p1) {
+    %r = load <16 x i32>, <16 x i32>* %p1, align 32
+    ret <16 x i32> %r
+  }
+
+  define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) {
+    store <16 x i32> %val, <16 x i32>* %p1, align 1
+    ret void
+  }
+
+  define void @test_store_v16i32_align(<16 x i32> %val, <16 x i32>* %p1) {
+    store <16 x i32> %val, <16 x i32>* %p1, align 32
+    ret void
+  }
+
+...
+---
+name:            test_load_v16i32_noalign
+# AVX512F-LABEL: name:  test_load_v16i32_noalign
+alignment:       4
+legalized:       true
+regBankSelected: true
+# AVX512F:      registers:
+# AVX512F-NEXT:   - { id: 0, class: gr64 }
+# AVX512F-NEXT:   - { id: 1, class: vr512 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: vecr }
+# AVX512F:          %0 = COPY %rdi
+# AVX512F-NEXT:     %1 = VMOVUPSZrm %0, 1, _, 0, _ :: (load 64 from %ir.p1, align 1)
+# AVX512F-NEXT:     %zmm0 = COPY %1
+# AVX512F-NEXT:     RET 0, implicit %zmm0
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 1)
+    %zmm0 = COPY %1(<16 x s32>)
+    RET 0, implicit %zmm0
+
+...
+---
+name:            test_load_v16i32_align
+# AVX512F-LABEL: name:  test_load_v16i32_align
+alignment:       4
+legalized:       true
+regBankSelected: true
+# AVX512F:      registers:
+# AVX512F-NEXT:   - { id: 0, class: gr64 }
+# AVX512F-NEXT:   - { id: 1, class: vr512 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: vecr }
+# AVX512F:          %0 = COPY %rdi
+# AVX512F-NEXT:     %1 = VMOVUPSZrm %0, 1, _, 0, _ :: (load 64 from %ir.p1, align 32)
+# AVX512F-NEXT:     %zmm0 = COPY %1
+# AVX512F-NEXT:     RET 0, implicit %zmm0
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 32)
+    %zmm0 = COPY %1(<16 x s32>)
+    RET 0, implicit %zmm0
+
+...
+---
+name:            test_store_v16i32_noalign
+# AVX512F-LABEL: name:  test_store_v16i32_noalign
+alignment:       4
+legalized:       true
+regBankSelected: true
+# AVX512F:      registers:
+# AVX512F-NEXT:   - { id: 0, class: vr512 }
+# AVX512F-NEXT:   - { id: 1, class: gr64 }
+registers:
+  - { id: 0, class: vecr }
+  - { id: 1, class: gpr }
+# AVX512F:          %0 = COPY %zmm0
+# AVX512F-NEXT:     %1 = COPY %rdi
+# AVX512F-NEXT:     VMOVUPSZmr %1, 1, _, 0, _, %0 :: (store 64 into %ir.p1, align 1)
+# AVX512F-NEXT:     RET 0
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %zmm0
+
+    %0(<16 x s32>) = COPY %zmm0
+    %1(p0) = COPY %rdi
+    G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 1)
+    RET 0
+
+...
+---
+name:            test_store_v16i32_align
+# AVX512F-LABEL: name:  test_store_v16i32_align
+alignment:       4
+legalized:       true
+regBankSelected: true
+# AVX512F:      registers:
+# AVX512F-NEXT:   - { id: 0, class: vr512 }
+# AVX512F-NEXT:   - { id: 1, class: gr64 }
+registers:
+  - { id: 0, class: vecr }
+  - { id: 1, class: gpr }
+# AVX512F:          %0 = COPY %zmm0
+# AVX512F-NEXT:     %1 = COPY %rdi
+# AVX512F-NEXT:     VMOVUPSZmr %1, 1, _, 0, _, %0 :: (store 64 into %ir.p1, align 32)
+# AVX512F-NEXT:     RET 0
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %zmm0
+
+    %0(<16 x s32>) = COPY %zmm0
+    %1(p0) = COPY %rdi
+    G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 32)
+    RET 0
+
+...
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index cf514d7aeb31..016ddb9c5e78 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,10 +1,8 @@
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck --check-prefix=FAST-YMM-ZMM %s
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s
-
-; FAST-YMM-ZMM-NOT: vzeroupper
-; BTVER2-NOT: vzeroupper
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-YMM-ZMM
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2
 
 declare i32 @foo()
 declare <4 x float> @do_sse(<4 x float>)
@@ -15,43 +13,86 @@ declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind
 
 ;; Basic checking - don't emit any vzeroupper instruction
 
-; CHECK: _test00
-define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
-entry:
-  ; CHECK-NOT: vzeroupper
+define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind {
+; ALL-LABEL: test00:
+; ALL:       # BB#0:
+; ALL-NEXT:    pushq %rax
+; ALL-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; ALL-NEXT:    callq do_sse
+; ALL-NEXT:    popq %rax
+; ALL-NEXT:    retq
   %add.i = fadd <4 x float> %a, %b
   %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
-  ; CHECK: ret
   ret <4 x float> %call3
 }
 
 ;; Check parameter 256-bit parameter passing
 
-; CHECK: _test01
-define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp {
-entry:
+define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind {
+; VZ-LABEL: test01:
+; VZ:       # BB#0:
+; VZ-NEXT:    subq $56, %rsp
+; VZ-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
+; VZ-NEXT:    vmovaps {{.*}}(%rip), %xmm0
+; VZ-NEXT:    vzeroupper
+; VZ-NEXT:    callq do_sse
+; VZ-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
+; VZ-NEXT:    callq do_sse
+; VZ-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
+; VZ-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; VZ-NEXT:    addq $56, %rsp
+; VZ-NEXT:    retq
+;
+; FAST-YMM-ZMM-LABEL: test01:
+; FAST-YMM-ZMM:       # BB#0:
+; FAST-YMM-ZMM-NEXT:    subq $56, %rsp
+; FAST-YMM-ZMM-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
+; FAST-YMM-ZMM-NEXT:    vmovaps {{.*}}(%rip), %xmm0
+; FAST-YMM-ZMM-NEXT:    callq do_sse
+; FAST-YMM-ZMM-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
+; FAST-YMM-ZMM-NEXT:    callq do_sse
+; FAST-YMM-ZMM-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
+; FAST-YMM-ZMM-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; FAST-YMM-ZMM-NEXT:    addq $56, %rsp
+; FAST-YMM-ZMM-NEXT:    retq
+;
+; BTVER2-LABEL: test01:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    subq $56, %rsp
+; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %xmm0
+; BTVER2-NEXT:    vmovups %ymm2, (%rsp) # 32-byte Spill
+; BTVER2-NEXT:    callq do_sse
+; BTVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
+; BTVER2-NEXT:    callq do_sse
+; BTVER2-NEXT:    vmovaps %xmm0, {{.*}}(%rip)
+; BTVER2-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
+; BTVER2-NEXT:    addq $56, %rsp
+; BTVER2-NEXT:    retq
   %tmp = load <4 x float>, <4 x float>* @x, align 16
-  ; CHECK: vzeroupper
-  ; CHECK-NEXT: callq _do_sse
   %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
   store <4 x float> %call, <4 x float>* @x, align 16
-  ; CHECK-NOT: vzeroupper
-  ; CHECK: callq _do_sse
   %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
   store <4 x float> %call2, <4 x float>* @x, align 16
-  ; CHECK: ret
   ret <8 x float> %c
 }
 
 ;; Check that vzeroupper is emitted for tail calls.
 
-; CHECK: _test02
-define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind uwtable ssp {
-entry:
+define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
+; VZ-LABEL: test02:
+; VZ:       # BB#0:
+; VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; VZ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; VZ-NEXT:    vzeroupper
+; VZ-NEXT:    jmp do_sse # TAILCALL
+;
+; NO-VZ-LABEL: test02:
+; NO-VZ:       # BB#0:
+; NO-VZ-NEXT:    vaddps %ymm1, %ymm0, %ymm0
+; NO-VZ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; NO-VZ-NEXT:    jmp do_sse # TAILCALL
   %add.i = fadd <8 x float> %a, %b
   %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
-  ; CHECK: vzeroupper
-  ; CHECK: jmp _do_sse
   %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
   ret <4 x float> %call3
 }
@@ -59,30 +100,113 @@ entry:
 ;; Test the pass convergence and also that vzeroupper is only issued when necessary,
 ;; for this function it should be only once
 
-; CHECK: _test03
-define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
+; VZ-LABEL: test03:
+; VZ:       # BB#0: # %entry
+; VZ-NEXT:    pushq %rbx
+; VZ-NEXT:    subq $16, %rsp
+; VZ-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; VZ-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; VZ-NEXT:    .p2align 4, 0x90
+; VZ-NEXT:  .LBB3_1: # %while.cond
+; VZ-NEXT:    # =>This Inner Loop Header: Depth=1
+; VZ-NEXT:    callq foo
+; VZ-NEXT:    testl %eax, %eax
+; VZ-NEXT:    jne .LBB3_1
+; VZ-NEXT:  # BB#2: # %for.body.preheader
+; VZ-NEXT:    movl $4, %ebx
+; VZ-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; VZ-NEXT:    .p2align 4, 0x90
+; VZ-NEXT:  .LBB3_3: # %for.body
+; VZ-NEXT:    # =>This Inner Loop Header: Depth=1
+; VZ-NEXT:    callq do_sse
+; VZ-NEXT:    callq do_sse
+; VZ-NEXT:    vmovaps {{.*}}(%rip), %ymm0
+; VZ-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; VZ-NEXT:    vzeroupper
+; VZ-NEXT:    callq do_sse
+; VZ-NEXT:    decl %ebx
+; VZ-NEXT:    jne .LBB3_3
+; VZ-NEXT:  # BB#4: # %for.end
+; VZ-NEXT:    addq $16, %rsp
+; VZ-NEXT:    popq %rbx
+; VZ-NEXT:    retq
+;
+; FAST-YMM-ZMM-LABEL: test03:
+; FAST-YMM-ZMM:       # BB#0: # %entry
+; FAST-YMM-ZMM-NEXT:    pushq %rbx
+; FAST-YMM-ZMM-NEXT:    subq $16, %rsp
+; FAST-YMM-ZMM-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; FAST-YMM-ZMM-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; FAST-YMM-ZMM-NEXT:    .p2align 4, 0x90
+; FAST-YMM-ZMM-NEXT:  .LBB3_1: # %while.cond
+; FAST-YMM-ZMM-NEXT:    # =>This Inner Loop Header: Depth=1
+; FAST-YMM-ZMM-NEXT:    callq foo
+; FAST-YMM-ZMM-NEXT:    testl %eax, %eax
+; FAST-YMM-ZMM-NEXT:    jne .LBB3_1
+; FAST-YMM-ZMM-NEXT:  # BB#2: # %for.body.preheader
+; FAST-YMM-ZMM-NEXT:    movl $4, %ebx
+; FAST-YMM-ZMM-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; FAST-YMM-ZMM-NEXT:    .p2align 4, 0x90
+; FAST-YMM-ZMM-NEXT:  .LBB3_3: # %for.body
+; FAST-YMM-ZMM-NEXT:    # =>This Inner Loop Header: Depth=1
+; FAST-YMM-ZMM-NEXT:    callq do_sse
+; FAST-YMM-ZMM-NEXT:    callq do_sse
+; FAST-YMM-ZMM-NEXT:    vmovaps {{.*}}(%rip), %ymm0
+; FAST-YMM-ZMM-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; FAST-YMM-ZMM-NEXT:    callq do_sse
+; FAST-YMM-ZMM-NEXT:    decl %ebx
+; FAST-YMM-ZMM-NEXT:    jne .LBB3_3
+; FAST-YMM-ZMM-NEXT:  # BB#4: # %for.end
+; FAST-YMM-ZMM-NEXT:    addq $16, %rsp
+; FAST-YMM-ZMM-NEXT:    popq %rbx
+; FAST-YMM-ZMM-NEXT:    retq
+;
+; BTVER2-LABEL: test03:
+; BTVER2:       # BB#0: # %entry
+; BTVER2-NEXT:    pushq %rbx
+; BTVER2-NEXT:    subq $16, %rsp
+; BTVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; BTVER2-NEXT:    .p2align 4, 0x90
+; BTVER2-NEXT:  .LBB3_1: # %while.cond
+; BTVER2-NEXT:    # =>This Inner Loop Header: Depth=1
+; BTVER2-NEXT:    callq foo
+; BTVER2-NEXT:    testl %eax, %eax
+; BTVER2-NEXT:    jne .LBB3_1
+; BTVER2-NEXT:  # BB#2: # %for.body.preheader
+; BTVER2-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
+; BTVER2-NEXT:    movl $4, %ebx
+; BTVER2-NEXT:    .p2align 4, 0x90
+; BTVER2-NEXT:  .LBB3_3: # %for.body
+; BTVER2-NEXT:    # =>This Inner Loop Header: Depth=1
+; BTVER2-NEXT:    callq do_sse
+; BTVER2-NEXT:    callq do_sse
+; BTVER2-NEXT:    vmovaps {{.*}}(%rip), %ymm0
+; BTVER2-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; BTVER2-NEXT:    callq do_sse
+; BTVER2-NEXT:    decl %ebx
+; BTVER2-NEXT:    jne .LBB3_3
+; BTVER2-NEXT:  # BB#4: # %for.end
+; BTVER2-NEXT:    addq $16, %rsp
+; BTVER2-NEXT:    popq %rbx
+; BTVER2-NEXT:    retq
 entry:
   %add.i = fadd <4 x float> %a, %b
   br label %while.cond
 
-while.cond: 
+while.cond:
   %call = tail call i32 @foo()
   %tobool = icmp eq i32 %call, 0
   br i1 %tobool, label %for.body, label %while.cond
 
 for.body:
-  ; CHECK: LBB
-  ; CHECK-NOT: vzeroupper
   %i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ]
   %c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ]
-  ; CHECK: callq _do_sse
   %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
-  ; CHECK-NEXT: callq _do_sse
   %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
   %tmp11 = load <8 x float>, <8 x float>* @g, align 32
   %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
-  ; CHECK: vzeroupper
-  ; CHECK-NEXT: callq _do_sse
   %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
   %1 = add nsw i32 %i.018, 1
   %exitcond = icmp eq i32 %1, 4
@@ -94,15 +218,30 @@ for.end:
 
 ;; Check that we also perform vzeroupper when we return from a function.
 
-; CHECK: _test04
-define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
-entry:
+define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind {
+; VZ-LABEL: test04:
+; VZ:       # BB#0:
+; VZ-NEXT:    pushq %rax
+; VZ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; VZ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; VZ-NEXT:    callq do_avx
+; VZ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; VZ-NEXT:    popq %rax
+; VZ-NEXT:    vzeroupper
+; VZ-NEXT:    retq
+;
+; NO-VZ-LABEL: test04:
+; NO-VZ:       # BB#0:
+; NO-VZ-NEXT:    pushq %rax
+; NO-VZ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; NO-VZ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; NO-VZ-NEXT:    callq do_avx
+; NO-VZ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; NO-VZ-NEXT:    popq %rax
+; NO-VZ-NEXT:    retq
   %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ; CHECK-NOT: vzeroupper
-  ; CHECK: call
   %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
   %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ; CHECK: vzeroupper
-  ; CHECK: ret
   ret <4 x float> %shuf2
 }
+
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 3337f42eb142..51f9a382ccbf 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2216,9 +2216,9 @@ define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    kunpckwd %k1, %k0, %k0
+; AVX512F-32-NEXT:    kmovw {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    kunpckwd %k0, %k1, %k0
 ; AVX512F-32-NEXT:    kmovd %k0, %eax
 ; AVX512F-32-NEXT:    retl
   %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
diff --git a/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll b/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll
new file mode 100644
index 000000000000..019c5282f63b
--- /dev/null
+++ b/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq --show-mc-encoding | FileCheck %s --check-prefix=X86_64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vpopcntdq --show-mc-encoding | FileCheck %s --check-prefix=X86
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; The following tests check that patterns that includes      ;;
+;; ctpop intrinsic + select are translated to the vpopcntd/q  ;;
+;; instruction in a correct way.                              ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <16 x i32> @test_mask_vpopcnt_d(<16 x i32> %a, i16 %mask, <16 x i32> %b) {
+; X86_64-LABEL: test_mask_vpopcnt_d:
+; X86_64:       # BB#0:
+; X86_64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X86_64-NEXT:    vpopcntd %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x55,0xc1]
+; X86_64-NEXT:    retq # encoding: [0xc3]
+;
+; X86-LABEL: test_mask_vpopcnt_d:
+; X86:       # BB#0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpopcntd %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x55,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+  %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %a
+  ret <16 x i32> %3
+}
+
+define <16 x i32> @test_maskz_vpopcnt_d(i16 %mask, <16 x i32> %a) {
+; X86_64-LABEL: test_maskz_vpopcnt_d:
+; X86_64:       # BB#0:
+; X86_64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X86_64-NEXT:    vpopcntd %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x55,0xc0]
+; X86_64-NEXT:    retq # encoding: [0xc3]
+;
+; X86-LABEL: test_maskz_vpopcnt_d:
+; X86:       # BB#0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpopcntd %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x55,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+  %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  ret <16 x i32> %3
+}
+
+define <8 x i64> @test_mask_vpopcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; X86_64-LABEL: test_mask_vpopcnt_q:
+; X86_64:       # BB#0:
+; X86_64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X86_64-NEXT:    vpopcntq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x55,0xc8]
+; X86_64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86_64-NEXT:    retq # encoding: [0xc3]
+;
+; X86-LABEL: test_mask_vpopcnt_q:
+; X86:       # BB#0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpopcntq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x55,0xc8]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+  %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %b
+  ret <8 x i64> %3
+}
+
+define <8 x i64> @test_maskz_vpopcnt_q(<8 x i64> %a, i8 %mask) {
+; X86_64-LABEL: test_maskz_vpopcnt_q:
+; X86_64:       # BB#0:
+; X86_64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X86_64-NEXT:    vpopcntq %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x55,0xc0]
+; X86_64-NEXT:    retq # encoding: [0xc3]
+;
+; X86-LABEL: test_maskz_vpopcnt_q:
+; X86:       # BB#0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpopcntq %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x55,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+  %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+  ret <8 x i64> %3
+}
+
+declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>)
+declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>)
diff --git a/test/CodeGen/X86/fast-isel-select-cmp.ll b/test/CodeGen/X86/fast-isel-select-cmp.ll
index 1af30e9f32fe..4a8e8792f98d 100644
--- a/test/CodeGen/X86/fast-isel-select-cmp.ll
+++ b/test/CodeGen/X86/fast-isel-select-cmp.ll
@@ -4,9 +4,9 @@
 ; different basic blocks.
 
 define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) {
-; CHECK-LABEL: select_cmp_cmov_i32
+; CHECK-LABEL: select_cmp_cmov_i32:
 ; CHECK-LABEL: continue
-; CHECK-NOT:   cmp
+; CHECK-NOT:   cmp{{[^_]}}
   %1 = icmp ult i32 %a, %b
   br i1 %1, label %continue, label %exit
 
@@ -19,9 +19,9 @@ exit:
 }
 
 define float @select_fcmp_oeq_f32(float %a, float %b, float %c, float %d) {
-; CHECK-LABEL: select_fcmp_oeq_f32
+; CHECK-LABEL: select_fcmp_oeq_f32:
 ; CHECK-LABEL: continue
-; CHECK-NOT:   cmp
+; CHECK-NOT:   cmp{{[^_]}}
   %1 = fcmp oeq float %a, %b
   br i1 %1, label %continue, label %exit
 
@@ -34,7 +34,7 @@ exit:
 }
 
 define float @select_fcmp_one_f32(float %a, float %b, float %c, float %d) {
-; CHECK-LABEL: select_fcmp_one_f32
+; CHECK-LABEL: select_fcmp_one_f32:
 ; CHECK-LABEL: continue
 ; CHECK-NOT:   ucomi
   %1 = fcmp one float %a, %b
diff --git a/test/CodeGen/X86/fp-intrinsics.ll b/test/CodeGen/X86/fp-intrinsics.ll
index 88aef6bb0659..0f8d730d7535 100644
--- a/test/CodeGen/X86/fp-intrinsics.ll
+++ b/test/CodeGen/X86/fp-intrinsics.ll
@@ -103,9 +103,156 @@ if.end:
   ret double %a.0
 }
 
+; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f5
+; CHECK:  sqrtsd
+define double @f5() {
+entry:
+  %result = call double @llvm.experimental.constrained.sqrt.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that pow(42.1, 3.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f6
+; CHECK:  pow
+define double @f6() {
+entry:
+  %result = call double @llvm.experimental.constrained.pow.f64(double 42.1,
+                                               double 3.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that powi(42.1, 3) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f7
+; CHECK:  powi
+define double @f7() {
+entry:
+  %result = call double @llvm.experimental.constrained.powi.f64(double 42.1,
+                                               i32 3,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that sin(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f8
+; CHECK:  sin
+define double @f8() {
+entry:
+  %result = call double @llvm.experimental.constrained.sin.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that cos(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f9
+; CHECK:  cos
+define double @f9() {
+entry:
+  %result = call double @llvm.experimental.constrained.cos.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that exp(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f10
+; CHECK:  exp
+define double @f10() {
+entry:
+  %result = call double @llvm.experimental.constrained.exp.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that exp2(42.1) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f11
+; CHECK:  exp2
+define double @f11() {
+entry:
+  %result = call double @llvm.experimental.constrained.exp2.f64(double 42.1,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that log(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f12
+; CHECK:  log
+define double @f12() {
+entry:
+  %result = call double @llvm.experimental.constrained.log.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that log10(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f13
+; CHECK:  log10
+define double @f13() {
+entry:
+  %result = call double @llvm.experimental.constrained.log10.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that log2(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f14
+; CHECK:  log2
+define double @f14() {
+entry:
+  %result = call double @llvm.experimental.constrained.log2.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that rint(42.1) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f15
+; CHECK:  rint
+define double @f15() {
+entry:
+  %result = call double @llvm.experimental.constrained.rint.f64(double 42.1,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that nearbyint(42.1) isn't simplified when the rounding mode is
+; unknown.
+; CHECK-LABEL: f16
+; CHECK:  nearbyint
+define double @f16() {
+entry:
+  %result = call double @llvm.experimental.constrained.nearbyint.f64(
+                                               double 42.1,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
 
 @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata"
 declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata)
+declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata)
+declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata)
+declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.exp.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.exp2.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.log10.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.log2.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata)
diff --git a/test/CodeGen/X86/hoist-invariant-load.ll b/test/CodeGen/X86/hoist-invariant-load.ll
index 5ade5b470b54..e7929c9cecdc 100644
--- a/test/CodeGen/X86/hoist-invariant-load.ll
+++ b/test/CodeGen/X86/hoist-invariant-load.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc -mcpu=haswell < %s -stats -O2 2>&1 | grep "4 machine-licm.*hoisted"
+; RUN: llc -mcpu=haswell < %s -stats -O2 2>&1 | grep "4 machinelicm.*hoisted"
 ; For test:
 ; 2 invariant loads, 1 for OBJC_SELECTOR_REFERENCES_
 ; and 1 for objc_msgSend from the GOT
diff --git a/test/CodeGen/X86/misched-copy.ll b/test/CodeGen/X86/misched-copy.ll
index 3e3729285d27..7abd157f147a 100644
--- a/test/CodeGen/X86/misched-copy.ll
+++ b/test/CodeGen/X86/misched-copy.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -verify-machineinstrs -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
 ;
 ; Test scheduling of copy instructions.
 ;
diff --git a/test/CodeGen/X86/or-branch.ll b/test/CodeGen/X86/or-branch.ll
index 4899a0fc7e88..71d7746642e9 100644
--- a/test/CodeGen/X86/or-branch.ll
+++ b/test/CodeGen/X86/or-branch.ll
@@ -1,16 +1,34 @@
-; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=0 | FileCheck %s --check-prefix=JUMP2 --check-prefix=CHECK
-; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=1 | FileCheck %s --check-prefix=JUMP1 --check-prefix=CHECK
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=0 | FileCheck %s --check-prefix=JUMP2
+; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=1 | FileCheck %s --check-prefix=JUMP1
 
 define void @foo(i32 %X, i32 %Y, i32 %Z) nounwind {
 ; JUMP2-LABEL: foo:
-; JUMP2-DAG:     jl
-; JUMP2-DAG:     je
+; JUMP2:       # BB#0: # %entry
+; JUMP2-NEXT:    cmpl $5, {{[0-9]+}}(%esp)
+; JUMP2-NEXT:    jl .LBB0_3
+; JUMP2-NEXT:  # BB#1: # %entry
+; JUMP2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; JUMP2-NEXT:    testl %eax, %eax
+; JUMP2-NEXT:    je .LBB0_3
+; JUMP2-NEXT:  # BB#2: # %UnifiedReturnBlock
+; JUMP2-NEXT:    retl
+; JUMP2-NEXT:  .LBB0_3: # %cond_true
+; JUMP2-NEXT:    jmp bar # TAILCALL
 ;
 ; JUMP1-LABEL: foo:
-; JUMP1-DAG:     sete
-; JUMP1-DAG:     setl
-; JUMP1:         orb
-; JUMP1:         jne
+; JUMP1:       # BB#0: # %entry
+; JUMP1-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; JUMP1-NEXT:    sete %al
+; JUMP1-NEXT:    cmpl $5, {{[0-9]+}}(%esp)
+; JUMP1-NEXT:    setl %cl
+; JUMP1-NEXT:    orb %al, %cl
+; JUMP1-NEXT:    cmpb $1, %cl
+; JUMP1-NEXT:    jne .LBB0_1
+; JUMP1-NEXT:  # BB#2: # %cond_true
+; JUMP1-NEXT:    jmp bar # TAILCALL
+; JUMP1-NEXT:  .LBB0_1: # %UnifiedReturnBlock
+; JUMP1-NEXT:    retl
 entry:
   %tmp1 = icmp eq i32 %X, 0
   %tmp3 = icmp slt i32 %Y, 5
@@ -29,11 +47,33 @@ UnifiedReturnBlock:
 ; regardless of whether they are expensive or not.
 
 define void @unpredictable(i32 %X, i32 %Y, i32 %Z) nounwind {
-; CHECK-LABEL: unpredictable:
-; CHECK-DAG:     sete
-; CHECK-DAG:     setl
-; CHECK:         orb
-; CHECK:         jne
+; JUMP2-LABEL: unpredictable:
+; JUMP2:       # BB#0: # %entry
+; JUMP2-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; JUMP2-NEXT:    sete %al
+; JUMP2-NEXT:    cmpl $5, {{[0-9]+}}(%esp)
+; JUMP2-NEXT:    setl %cl
+; JUMP2-NEXT:    orb %al, %cl
+; JUMP2-NEXT:    cmpb $1, %cl
+; JUMP2-NEXT:    jne .LBB1_1
+; JUMP2-NEXT:  # BB#2: # %cond_true
+; JUMP2-NEXT:    jmp bar # TAILCALL
+; JUMP2-NEXT:  .LBB1_1: # %UnifiedReturnBlock
+; JUMP2-NEXT:    retl
+;
+; JUMP1-LABEL: unpredictable:
+; JUMP1:       # BB#0: # %entry
+; JUMP1-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; JUMP1-NEXT:    sete %al
+; JUMP1-NEXT:    cmpl $5, {{[0-9]+}}(%esp)
+; JUMP1-NEXT:    setl %cl
+; JUMP1-NEXT:    orb %al, %cl
+; JUMP1-NEXT:    cmpb $1, %cl
+; JUMP1-NEXT:    jne .LBB1_1
+; JUMP1-NEXT:  # BB#2: # %cond_true
+; JUMP1-NEXT:    jmp bar # TAILCALL
+; JUMP1-NEXT:  .LBB1_1: # %UnifiedReturnBlock
+; JUMP1-NEXT:    retl
 entry:
   %tmp1 = icmp eq i32 %X, 0
   %tmp3 = icmp slt i32 %Y, 5
diff --git a/test/CodeGen/X86/pr27681.mir b/test/CodeGen/X86/pr27681.mir
index 002761bc1e68..956df172b253 100644
--- a/test/CodeGen/X86/pr27681.mir
+++ b/test/CodeGen/X86/pr27681.mir
@@ -57,7 +57,7 @@ body:             |
     %cl = SETNEr implicit %eflags
     ; Verify that removal of the %bl antidependence does not use %ch
     ; as a replacement register.
-    ; CHECK: %cl = AND8rr %cl, killed %b
+    ; CHECK: %cl = AND8rr killed %cl, killed %b
     %cl = AND8rr killed %cl, killed %bl, implicit-def dead %eflags
     CMP32ri8 %ebp, -1, implicit-def %eflags
     %edx = MOV32ri 0
diff --git a/test/CodeGen/X86/sandybridge-loads.ll b/test/CodeGen/X86/sandybridge-loads.ll
index 2e31154068fc..8570fe7fe7ba 100644
--- a/test/CodeGen/X86/sandybridge-loads.ll
+++ b/test/CodeGen/X86/sandybridge-loads.ll
@@ -1,13 +1,20 @@
-; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s
-
-;CHECK-LABEL: wideloads:
-;CHECK: vmovaps
-;CHECK: vinsertf128
-;CHECK: vmovaps
-;CHECK-NOT: vinsertf128
-;CHECK: ret
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s
 
 define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+; CHECK-LABEL: wideloads:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps (%rdi), %xmm0
+; CHECK-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; CHECK-NEXT:    vmovaps (%rsi), %ymm1
+; CHECK-NEXT:    vcmpltps %ymm0, %ymm1, %ymm1
+; CHECK-NEXT:    vmovaps (%rdx), %ymm2
+; CHECK-NEXT:    vcmpltps %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT:    vmovaps %ymm0, (%rax)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %v0 = load <8 x float>, <8 x float>* %a, align 16  ; <---- unaligned!
   %v1 = load <8 x float>, <8 x float>* %b, align 32  ; <---- aligned!
   %m0 = fcmp olt <8 x float> %v1, %v0
@@ -19,17 +26,16 @@ define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
   ret void
 }
 
-; CHECK: widestores
-; loads:
-; CHECK: vmovaps
-; CHECK: vmovaps
-; stores:
-; CHECK: vmovaps
-; CHECK: vextractf128
-; CHECK: vmovaps
-;CHECK: ret
-
 define void @widestores(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
+; CHECK-LABEL: widestores:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps (%rdi), %ymm0
+; CHECK-NEXT:    vmovaps (%rsi), %ymm1
+; CHECK-NEXT:    vmovaps %ymm0, (%rsi)
+; CHECK-NEXT:    vextractf128 $1, %ymm1, 16(%rdi)
+; CHECK-NEXT:    vmovaps %xmm1, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %v0 = load <8 x float>, <8 x float>* %a, align 32
   %v1 = load <8 x float>, <8 x float>* %b, align 32
   store <8 x float> %v0, <8 x float>* %b, align 32 ; <--- aligned
diff --git a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
index 383ab21bd404..19305d0dad62 100644
--- a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
@@ -354,9 +354,8 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
 define i32 @test_mm_crc32_u8(i32 %a0, i8 %a1) {
 ; X32-LABEL: test_mm_crc32_u8:
 ; X32:       # BB#0:
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    crc32b %cl, %eax
+; X32-NEXT:    crc32b {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_crc32_u8:
@@ -372,9 +371,8 @@ declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind readnone
 define i32 @test_mm_crc32_u16(i32 %a0, i16 %a1) {
 ; X32-LABEL: test_mm_crc32_u16:
 ; X32:       # BB#0:
-; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    crc32w %cx, %eax
+; X32-NEXT:    crc32w {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_crc32_u16:
diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll
index 72542f499087..a00d47bb13e9 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -1651,26 +1651,9 @@ define <8 x float> @stack_fold_sqrtps_ymm(<8 x float> %a0) {
 }
 declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
 
-define double @stack_fold_sqrtsd(double %a0) {
-  ;CHECK-LABEL: stack_fold_sqrtsd
-  ;CHECK:       vsqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call double @llvm.sqrt.f64(double %a0)
-  ret double %2
-}
-declare double @llvm.sqrt.f64(double) nounwind readnone
-
+; TODO stack_fold_sqrtsd
 ; TODO stack_fold_sqrtsd_int
-
-define float @stack_fold_sqrtss(float %a0) {
-  ;CHECK-LABEL: stack_fold_sqrtss
-  ;CHECK:       vsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call float @llvm.sqrt.f32(float %a0)
-  ret float %2
-}
-declare float @llvm.sqrt.f32(float) nounwind readnone
-
+; TODO stack_fold_sqrtss
 ; TODO stack_fold_sqrtss_int
 
 define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
diff --git a/test/CodeGen/X86/twoaddr-coalesce-2.ll b/test/CodeGen/X86/twoaddr-coalesce-2.ll
index cbcde0655597..9da071f7ede6 100644
--- a/test/CodeGen/X86/twoaddr-coalesce-2.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce-2.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -stats 2>&1 | \
-; RUN:   grep "twoaddrinstr" | grep "Number of instructions aggressively commuted"
+; RUN:   grep "twoaddressinstruction" | grep "Number of instructions aggressively commuted"
 ; rdar://6480363
 
 target triple = "i386-apple-darwin9.6"
diff --git a/test/CodeGen/X86/vector-narrow-binop.ll b/test/CodeGen/X86/vector-narrow-binop.ll
index f737ea2b7fba..4d183f3172b3 100644
--- a/test/CodeGen/X86/vector-narrow-binop.ll
+++ b/test/CodeGen/X86/vector-narrow-binop.ll
@@ -22,17 +22,17 @@ define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d
 ;
 ; AVX1-LABEL: PR32790:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm1
+; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm1
+; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: PR32790:
@@ -60,46 +60,17 @@ define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d
 define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
 ; SSE-LABEL: do_not_use_256bit_op:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pand %xmm3, %xmm1
 ; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    pand %xmm3, %xmm1
 ; SSE-NEXT:    psubd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: do_not_use_256bit_op:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
-; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: do_not_use_256bit_op:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
-; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: do_not_use_256bit_op:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
-; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: do_not_use_256bit_op:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %concat1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %concat2 = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %and = and <8 x i32> %concat1, %concat2
diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll
index 27909c6bb4a0..adda108bdc77 100644
--- a/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/test/CodeGen/X86/vector-popcnt-128.ll
@@ -5,6 +5,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
 
 define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE2-LABEL: testv2i64:
@@ -81,19 +82,41 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; SSE41-NEXT:    psadbw %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: testv2i64:
-; AVX:       # BB#0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: testv2i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: testv2i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv2i64:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT:    vzeroupper
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
   ret <2 x i64> %out
 }
@@ -193,23 +216,49 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; SSE41-NEXT:    packuswb %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: testv4i32:
-; AVX:       # BB#0:
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: testv4i32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: testv4i32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv4i32:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT:    vzeroupper
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
   ret <4 x i32> %out
 }
diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll
index 7a675619d720..accbad35e9d7 100644
--- a/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/test/CodeGen/X86/vector-popcnt-256.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
 
 define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-LABEL: testv4i64:
@@ -39,6 +40,13 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv4i64:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in)
   ret <4 x i64> %out
 }
@@ -92,6 +100,13 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX2-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv8i32:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in)
   ret <8 x i32> %out
 }
@@ -137,6 +152,21 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; AVX2-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv16i16:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsllw $8, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in)
   ret <16 x i16> %out
 }
@@ -173,6 +203,18 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX2-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv32i8:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in)
   ret <32 x i8> %out
 }
diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll
index cf4f21e62b61..aa50206e7a5e 100644
--- a/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/test/CodeGen/X86/vector-popcnt-512.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ
 
 define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ; AVX512F-LABEL: testv8i64:
@@ -39,6 +40,11 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ; AVX512BW-NEXT:    vpxord %zmm1, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv8i64:
+; AVX512VPOPCNTDQ:       ## BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %in)
   ret <8 x i64> %out
 }
@@ -92,6 +98,11 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv16i32:
+; AVX512VPOPCNTDQ:       ## BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in)
   ret <16 x i32> %out
 }
@@ -135,6 +146,30 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv32i16:
+; AVX512VPOPCNTDQ:       ## BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsllw $8, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm0, %ymm3, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsllw $8, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in)
   ret <32 x i16> %out
 }
@@ -169,6 +204,24 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
 ; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv64i8:
+; AVX512VPOPCNTDQ:       ## BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in)
   ret <64 x i8> %out
 }
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
index fa3471c2fe40..2e65bd8c75c7 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -282,8 +282,7 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
 define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
 ; ALL-LABEL: shuffle_v16f32_extract_256:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovups (%rsi), %zmm0
-; ALL-NEXT:    vextractf32x8 $1, %zmm0, %ymm0
+; ALL-NEXT:    vmovups 32(%rsi), %ymm0
 ; ALL-NEXT:    retq
   %ptr_a = bitcast float* %a to <16 x float>*
   %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4
diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll
index 5aab21749d14..706edd27a3f1 100644
--- a/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -511,11 +511,10 @@ define <8 x float> @expand14(<4 x float> %a) {
 ;
 ; KNL64-LABEL: expand14:
 ; KNL64:       # BB#0:
+; KNL64-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,3,0,0]
+; KNL64-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; KNL64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; KNL64-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
-; KNL64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0]
-; KNL64-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
 ; KNL64-NEXT:    retq
 ;
@@ -529,11 +528,10 @@ define <8 x float> @expand14(<4 x float> %a) {
 ;
 ; KNL32-LABEL: expand14:
 ; KNL32:       # BB#0:
+; KNL32-NEXT:    vpermilps {{.*#+}} xmm1 = mem[3,3,0,0]
+; KNL32-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; KNL32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; KNL32-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
-; KNL32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0]
-; KNL32-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL32-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
 ; KNL32-NEXT:    retl
    %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
@@ -545,39 +543,35 @@ define <8 x float> @expand14(<4 x float> %a) {
 define <8 x float> @expand15(<4 x float> %a) {
 ; SKX64-LABEL: expand15:
 ; SKX64:       # BB#0:
-; SKX64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SKX64-NEXT:    vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>
-; SKX64-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0]
+; SKX64-NEXT:    vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
+; SKX64-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; SKX64-NEXT:    vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
-; SKX64-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm0
+; SKX64-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm0
 ; SKX64-NEXT:    retq
 ;
 ; KNL64-LABEL: expand15:
 ; KNL64:       # BB#0:
+; KNL64-NEXT:    vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
+; KNL64-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; KNL64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; KNL64-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
-; KNL64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0]
-; KNL64-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
 ; KNL64-NEXT:    retq
 ;
 ; SKX32-LABEL: expand15:
 ; SKX32:       # BB#0:
-; SKX32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SKX32-NEXT:    vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>
-; SKX32-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0]
+; SKX32-NEXT:    vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
+; SKX32-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
 ; SKX32-NEXT:    vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
-; SKX32-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm0
+; SKX32-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm0
 ; SKX32-NEXT:    retl
 ;
 ; KNL32-LABEL: expand15:
 ; KNL32:       # BB#0:
+; KNL32-NEXT:    vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
+; KNL32-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; KNL32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; KNL32-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
-; KNL32-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0]
-; KNL32-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
 ; KNL32-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
 ; KNL32-NEXT:    retl
    %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
diff --git a/test/CodeGen/X86/vector-sqrt.ll b/test/CodeGen/X86/vector-sqrt.ll
index c5ac4466b5fa..13088b7fa5f2 100644
--- a/test/CodeGen/X86/vector-sqrt.ll
+++ b/test/CodeGen/X86/vector-sqrt.ll
@@ -5,8 +5,10 @@
 define <2 x double> @sqrtd2(double* nocapture readonly %v) local_unnamed_addr #0 {
 ; CHECK-LABEL: sqrtd2:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vsqrtsd (%rdi), %xmm0, %xmm0
-; CHECK-NEXT:    vsqrtsd 8(%rdi), %xmm1, %xmm1
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vsqrtsd %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    retq
 entry:
@@ -27,10 +29,14 @@ declare double @sqrt(double) local_unnamed_addr #1
 define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 {
 ; CHECK-LABEL: sqrtf4:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vsqrtss (%rdi), %xmm0, %xmm0
-; CHECK-NEXT:    vsqrtss 4(%rdi), %xmm1, %xmm1
-; CHECK-NEXT:    vsqrtss 8(%rdi), %xmm2, %xmm2
-; CHECK-NEXT:    vsqrtss 12(%rdi), %xmm3, %xmm3
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vsqrtss %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vsqrtss %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll
index a5fac9ac6a41..d4fbb72bbe6d 100644
--- a/test/CodeGen/X86/vector-trunc-math.ll
+++ b/test/CodeGen/X86/vector-trunc-math.ll
@@ -3030,10 +3030,10 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; SSE-LABEL: trunc_and_v8i32_v8i16:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm3, %xmm1
 ; SSE-NEXT:    pslld $16, %xmm1
 ; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    pslld $16, %xmm0
 ; SSE-NEXT:    psrad $16, %xmm0
 ; SSE-NEXT:    packssdw %xmm1, %xmm0
@@ -3786,10 +3786,10 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; SSE-LABEL: trunc_xor_v8i32_v8i16:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pxor %xmm2, %xmm0
 ; SSE-NEXT:    pxor %xmm3, %xmm1
 ; SSE-NEXT:    pslld $16, %xmm1
 ; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    pxor %xmm2, %xmm0
 ; SSE-NEXT:    pslld $16, %xmm0
 ; SSE-NEXT:    psrad $16, %xmm0
 ; SSE-NEXT:    packssdw %xmm1, %xmm0
@@ -4542,10 +4542,10 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; SSE-LABEL: trunc_or_v8i32_v8i16:
 ; SSE:       # BB#0:
-; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm3, %xmm1
 ; SSE-NEXT:    pslld $16, %xmm1
 ; SSE-NEXT:    psrad $16, %xmm1
+; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    pslld $16, %xmm0
 ; SSE-NEXT:    psrad $16, %xmm0
 ; SSE-NEXT:    packssdw %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index 22d0065b264f..a22a60756264 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -7,6 +7,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ
 ;
 ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41
@@ -117,6 +118,17 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
 ; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv2i64:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT:    vzeroupper
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: testv2i64:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    pxor %xmm1, %xmm1
@@ -284,6 +296,17 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; AVX512CD-NEXT:    vzeroupper
 ; AVX512CD-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv2i64u:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT:    vzeroupper
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: testv2i64u:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    pxor %xmm1, %xmm1
@@ -501,6 +524,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
 ; AVX512CD-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX512CD-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv4i32:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT:    vzeroupper
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: testv4i32:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    pxor %xmm1, %xmm1
@@ -700,6 +735,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; AVX512CD-NEXT:    vzeroupper
 ; AVX512CD-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv4i32u:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT:    vzeroupper
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: testv4i32u:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    pxor %xmm1, %xmm1
@@ -843,6 +890,25 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
 ; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv8i16:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: testv8i16:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    pxor %xmm1, %xmm1
@@ -984,6 +1050,25 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
 ; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv8i16u:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsllw $8, %xmm0, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $8, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: testv8i16u:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    pxor %xmm1, %xmm1
@@ -1106,6 +1191,22 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
 ; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv16i8:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: testv16i8:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    pxor %xmm1, %xmm1
@@ -1224,6 +1325,22 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
 ; AVX-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv16i8u:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubb %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %xmm0, %xmm3, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: testv16i8u:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    pxor %xmm1, %xmm1
@@ -1258,6 +1375,12 @@ define <2 x i64> @foldv2i64() nounwind {
 ; AVX-NEXT:    vmovq %rax, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: foldv2i64:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    movl $8, %eax
+; AVX512VPOPCNTDQ-NEXT:    vmovq %rax, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: foldv2i64:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movl $8, %eax
@@ -1280,6 +1403,12 @@ define <2 x i64> @foldv2i64u() nounwind {
 ; AVX-NEXT:    vmovq %rax, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: foldv2i64u:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    movl $8, %eax
+; AVX512VPOPCNTDQ-NEXT:    vmovq %rax, %xmm0
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: foldv2i64u:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movl $8, %eax
@@ -1300,6 +1429,11 @@ define <4 x i32> @foldv4i32() nounwind {
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
 ; AVX-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: foldv4i32:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: foldv4i32:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
@@ -1319,6 +1453,11 @@ define <4 x i32> @foldv4i32u() nounwind {
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
 ; AVX-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: foldv4i32u:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: foldv4i32u:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,32,0]
@@ -1338,6 +1477,11 @@ define <8 x i16> @foldv8i16() nounwind {
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
 ; AVX-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: foldv8i16:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: foldv8i16:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
@@ -1357,6 +1501,11 @@ define <8 x i16> @foldv8i16u() nounwind {
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
 ; AVX-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: foldv8i16u:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: foldv8i16u:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
@@ -1376,6 +1525,11 @@ define <16 x i8> @foldv16i8() nounwind {
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
 ; AVX-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: foldv16i8:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: foldv16i8:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
@@ -1395,6 +1549,11 @@ define <16 x i8> @foldv16i8u() nounwind {
 ; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
 ; AVX-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: foldv16i8u:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-SSE-LABEL: foldv16i8u:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index a0b277ddd732..101ae95550e7 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -3,6 +3,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
 ;
 ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2
@@ -12,11 +13,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
 ; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -28,6 +26,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
@@ -92,6 +92,17 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX512CD-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv4i64:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-AVX-LABEL: testv4i64:
 ; X32-AVX:       # BB#0:
 ; X32-AVX-NEXT:    vpxor %ymm1, %ymm1, %ymm1
@@ -117,11 +128,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1]
 ; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -133,6 +141,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsubq %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
@@ -182,6 +192,17 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX512CD-NEXT:    vpsubq %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv4i64u:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-AVX-LABEL: testv4i64u:
 ; X32-AVX:       # BB#0:
 ; X32-AVX-NEXT:    vpxor %ymm1, %ymm1, %ymm1
@@ -205,28 +226,27 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-LABEL: testv8i32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
-; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-NEXT:    vpsadbw %xmm1, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT:    vpsadbw %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
@@ -234,12 +254,12 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpsadbw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT:    vpsadbw %xmm2, %xmm3, %xmm3
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv8i32:
@@ -307,6 +327,17 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX512CD-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv8i32:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-AVX-LABEL: testv8i32:
 ; X32-AVX:       # BB#0:
 ; X32-AVX-NEXT:    vpxor %ymm1, %ymm1, %ymm1
@@ -335,28 +366,27 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX1-LABEL: testv8i32u:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm3
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
-; AVX1-NEXT:    vpsubd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
-; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX1-NEXT:    vpsadbw %xmm1, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpsadbw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT:    vpsadbw %xmm2, %xmm5, %xmm5
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm5
+; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
@@ -364,12 +394,12 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpsadbw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT:    vpsadbw %xmm2, %xmm3, %xmm3
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: testv8i32u:
@@ -414,6 +444,17 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX512CD-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv8i32u:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-AVX-LABEL: testv8i32u:
 ; X32-AVX:       # BB#0:
 ; X32-AVX-NEXT:    vpxor %ymm1, %ymm1, %ymm1
@@ -442,32 +483,31 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; AVX1-LABEL: testv16i16:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpsubw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
-; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm5
+; AVX1-NEXT:    vpaddb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
@@ -532,6 +572,25 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv16i16:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsllw $8, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-AVX-LABEL: testv16i16:
 ; X32-AVX:       # BB#0:
 ; X32-AVX-NEXT:    vpxor %ymm1, %ymm1, %ymm1
@@ -557,32 +616,31 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ; AVX1-LABEL: testv16i16u:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpsubw %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpsubw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm4
-; AVX1-NEXT:    vpaddb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT:    vpaddb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm5
+; AVX1-NEXT:    vpaddb %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
@@ -647,6 +705,25 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ; AVX512CD-NEXT:    vpsrlw $8, %ymm0, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv16i16u:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsllw $8, %ymm0, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-AVX-LABEL: testv16i16u:
 ; X32-AVX:       # BB#0:
 ; X32-AVX-NEXT:    vpxor %ymm1, %ymm1, %ymm1
@@ -674,27 +751,26 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -747,6 +823,22 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv32i8:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-AVX-LABEL: testv32i8:
 ; X32-AVX:       # BB#0:
 ; X32-AVX-NEXT:    vpxor %ymm1, %ymm1, %ymm1
@@ -771,27 +863,26 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
-; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
-; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm5
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vpaddb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpsubb %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm2
+; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
 ; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -844,6 +935,22 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX512CD-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
 ; AVX512CD-NEXT:    retq
 ;
+; AVX512VPOPCNTDQ-LABEL: testv32i8u:
+; AVX512VPOPCNTDQ:       # BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    retq
+;
 ; X32-AVX-LABEL: testv32i8u:
 ; X32-AVX:       # BB#0:
 ; X32-AVX-NEXT:    vpxor %ymm1, %ymm1, %ymm1
diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll
index 2d1715949a5e..abbe964e983c 100644
--- a/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=-avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ
 
 define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ; AVX512CD-LABEL: testv8i64:
@@ -64,6 +65,15 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
 ; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv8i64:
+; AVX512VPOPCNTDQ:       ## BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
   ret <8 x i64> %out
 }
@@ -105,6 +115,15 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
 ; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv8i64u:
+; AVX512VPOPCNTDQ:       ## BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpopcntq %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1)
   ret <8 x i64> %out
 }
@@ -186,6 +205,15 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv16i32:
+; AVX512VPOPCNTDQ:       ## BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 0)
   ret <16 x i32> %out
 }
@@ -231,6 +259,15 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
 ; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv16i32u:
+; AVX512VPOPCNTDQ:       ## BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubd %zmm0, %zmm1, %zmm1
+; AVX512VPOPCNTDQ-NEXT:    vpandd %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 -1)
   ret <16 x i32> %out
 }
@@ -305,6 +342,38 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv32i16:
+; AVX512VPOPCNTDQ:       ## BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsllw $8, %ymm0, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm0, %ymm5, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsllw $8, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0)
   ret <32 x i16> %out
 }
@@ -379,6 +448,38 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv32i16u:
+; AVX512VPOPCNTDQ:       ## BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm0, %ymm2, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsllw $8, %ymm0, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm0, %ymm5, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubw %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsllw $8, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1)
   ret <32 x i16> %out
 }
@@ -441,6 +542,32 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
 ; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv64i8:
+; AVX512VPOPCNTDQ:       ## BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0)
   ret <64 x i8> %out
 }
@@ -503,6 +630,32 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ; AVX512BW-NEXT:    vpshufb %zmm0, %zmm3, %zmm0
 ; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
+;
+; AVX512VPOPCNTDQ-LABEL: testv64i8u:
+; AVX512VPOPCNTDQ:       ## BB#0:
+; AVX512VPOPCNTDQ-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm0, %ymm2, %ymm3
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm5, %ymm6, %ymm5
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm5, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpsubb %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm2, %ymm6, %ymm2
+; AVX512VPOPCNTDQ-NEXT:    vpsrlw $4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    vpaddb %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT:    retq
   %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1)
   ret <64 x i8> %out
 }
diff --git a/test/CodeGen/X86/wide-integer-cmp.ll b/test/CodeGen/X86/wide-integer-cmp.ll
index fbaf500e8333..b5c7f86567a1 100644
--- a/test/CodeGen/X86/wide-integer-cmp.ll
+++ b/test/CodeGen/X86/wide-integer-cmp.ll
@@ -1,7 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=i686-linux-gnu %s -o - | FileCheck %s
 
-
 define i32 @branch_eq(i64 %a, i64 %b) {
 ; CHECK-LABEL: branch_eq:
 ; CHECK:       # BB#0: # %entry
diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll
index 6b2e4de5cdaa..42c4c23c6349 100644
--- a/test/CodeGen/X86/widened-broadcast.ll
+++ b/test/CodeGen/X86/widened-broadcast.ll
@@ -151,8 +151,7 @@ define <8 x i32> @load_splat_8i32_8i32_01010101(<8 x i32>* %ptr) nounwind uwtabl
 ;
 ; AVX1-LABEL: load_splat_8i32_8i32_01010101:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vmovapd (%rdi), %ymm0
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -288,8 +287,7 @@ define <16 x i16> @load_splat_16i16_16i16_0101010101010101(<16 x i16>* %ptr) nou
 ;
 ; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -315,22 +313,10 @@ define <16 x i16> @load_splat_16i16_16i16_0123012301230123(<16 x i16>* %ptr) nou
 ; SSE-NEXT:    movdqa %xmm0, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_16i16_16i16_0123012301230123:
-; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_16i16_16i16_0123012301230123:
-; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %ymm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_16i16_16i16_0123012301230123:
-; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_16i16_16i16_0123012301230123:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <16 x i16>, <16 x i16>* %ptr
   %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3,i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -513,8 +499,7 @@ define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(<32 x i8
 ;
 ; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -587,26 +572,10 @@ define <4 x float> @load_splat_4f32_8f32_0000(<8 x float>* %ptr) nounwind uwtabl
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_4f32_8f32_0000:
-; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_4f32_8f32_0000:
-; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %ymm0
-; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_4f32_8f32_0000:
-; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_4f32_8f32_0000:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <8 x float>, <8 x float>* %ptr
   %ret = shufflevector <8 x float> %ld, <8 x float> undef, <4 x i32> zeroinitializer
@@ -627,22 +596,10 @@ define <8 x float> @load_splat_8f32_16f32_89898989(<16 x float>* %ptr) nounwind
 ; SSE42-NEXT:    movapd %xmm0, %xmm1
 ; SSE42-NEXT:    retq
 ;
-; AVX1-LABEL: load_splat_8f32_16f32_89898989:
-; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vbroadcastsd 32(%rdi), %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load_splat_8f32_16f32_89898989:
-; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vbroadcastsd 32(%rdi), %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: load_splat_8f32_16f32_89898989:
-; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovapd (%rdi), %zmm0
-; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: load_splat_8f32_16f32_89898989:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vbroadcastsd 32(%rdi), %ymm0
+; AVX-NEXT:    retq
 entry:
   %ld = load <16 x float>, <16 x float>* %ptr
   %ret = shufflevector <16 x float> %ld, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 8, i32 9, i32 8, i32 9, i32 8, i32 9>
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 6fbec91e77a3..f4d0503f4a79 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -57,10 +57,8 @@ define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovups (%rdi), %ymm0
 ; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; AVX1-NEXT:    vmovups 96(%rdi), %ymm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
 ; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX1-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -69,10 +67,8 @@ define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vmovupd (%rdi), %ymm0
 ; AVX2-NEXT:    vmovupd 32(%rdi), %ymm1
-; AVX2-NEXT:    vmovupd 64(%rdi), %ymm2
-; AVX2-NEXT:    vmovupd 96(%rdi), %ymm3
-; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
+; AVX2-NEXT:    vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
 ; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
diff --git a/test/CodeGen/X86/x87.ll b/test/CodeGen/X86/x87.ll
index 683d7b05cf8c..9bc654861b69 100644
--- a/test/CodeGen/X86/x87.ll
+++ b/test/CodeGen/X86/x87.ll
@@ -1,13 +1,16 @@
 ; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X87
 ; RUN: llc < %s -march=x86-64 -mattr=-sse | FileCheck %s -check-prefix=X87
-; RUN: llc < %s -march=x86 -mattr=-x87 | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
-; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
-; RUN: llc < %s -march=x86 -mattr=-x87,+sse | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
-; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse2 | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
+; RUN: llc < %s -march=x86 -mattr=-x87 | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -march=x86 -mattr=-x87,+sse | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse2 | FileCheck %s -check-prefix=NOX87
 
 define void @test(i32 %i, i64 %l, float* %pf, double* %pd, fp128* %pld) nounwind readnone {
 ; X87-LABEL: test:
 ; NOX87-LABEL: test:
+
+; NOX87-NOT: {{ }}f{{.*}}
+
 ; X87: fild
 ; NOX87: __floatunsisf
   %tmp = uitofp i32 %i to float
diff --git a/test/CodeGen/XCore/epilogue_prologue.ll b/test/CodeGen/XCore/epilogue_prologue.ll
index aed49f4b67ba..d214c40dd9b9 100644
--- a/test/CodeGen/XCore/epilogue_prologue.ll
+++ b/test/CodeGen/XCore/epilogue_prologue.ll
@@ -6,7 +6,7 @@
 ; When using FP, for large or small frames, we may need one scratch register.
 
 ; FP + small frame: spill FP+SR = entsp 2
-; CHECKFP-LABEL: f1
+; CHECKFP-LABEL: f1:
 ; CHECKFP: entsp 2
 ; CHECKFP-NEXT: stw r10, sp[1]
 ; CHECKFP-NEXT: ldaw r10, sp[0]
@@ -15,7 +15,7 @@
 ; CHECKFP-NEXT: retsp 2
 ;
 ; !FP + small frame: no spills = no stack adjustment needed
-; CHECK-LABEL: f1
+; CHECK-LABEL: f1:
 ; CHECK: stw lr, sp[0]
 ; CHECK: ldw lr, sp[0]
 ; CHECK-NEXT: retsp 0
@@ -27,7 +27,7 @@ entry:
 
 
 ; FP + small frame: spill FP+SR+R0+LR = entsp 3 + extsp 1
-; CHECKFP-LABEL:f3
+; CHECKFP-LABEL: f3:
 ; CHECKFP: entsp 3
 ; CHECKFP-NEXT: stw r10, sp[1]
 ; CHECKFP-NEXT: ldaw r10, sp[0]
@@ -43,7 +43,7 @@ entry:
 ; CHECKFP-NEXT: retsp 3
 ;
 ; !FP + small frame: spill R0+LR = entsp 2
-; CHECK-LABEL: f3
+; CHECK-LABEL: f3:
 ; CHECK: entsp 2
 ; CHECK-NEXT: stw [[REG:r[4-9]+]], sp[1]
 ; CHECK-NEXT: mov [[REG]], r0
@@ -60,7 +60,7 @@ entry:
 
 
 ; FP + large frame: spill FP+SR = entsp 2 + 100000
-; CHECKFP-LABEL: f4
+; CHECKFP-LABEL: f4:
 ; CHECKFP: entsp 65535
 ; CHECKFP-NEXT: .Lcfi{{[0-9]+}}
 ; CHECKFP-NEXT: .cfi_def_cfa_offset 262140
@@ -81,7 +81,7 @@ entry:
 ; CHECKFP-NEXT: retsp 34467
 ;
 ; !FP + large frame: spill SR+SR = entsp 2 + 100000
-; CHECK-LABEL: f4
+; CHECK-LABEL: f4:
 ; CHECK: entsp 65535
 ; CHECK-NEXT: .Lcfi{{[0-9]+}}
 ; CHECK-NEXT: .cfi_def_cfa_offset 262140
@@ -107,7 +107,7 @@ entry:
 ; CHECKFP-NEXT: .LCPI[[CNST1:[0-9_]+]]:
 ; CHECKFP-NEXT: .long 200001
 ; CHECKFP-NEXT: .text
-; CHECKFP-LABEL: f6
+; CHECKFP-LABEL: f6:
 ; CHECKFP: entsp 65535
 ; CHECKFP-NEXT: .Lcfi{{[0-9]+}}
 ; CHECKFP-NEXT: .cfi_def_cfa_offset 262140
@@ -160,7 +160,7 @@ entry:
 ; CHECK-NEXT: .LCPI[[CNST1:[0-9_]+]]:
 ; CHECK-NEXT: .long 200002
 ; CHECK-NEXT: .text
-; CHECK-LABEL: f6
+; CHECK-LABEL: f6:
 ; CHECK: entsp 65535
 ; CHECK-NEXT: .Lcfi{{[0-9]+}}
 ; CHECK-NEXT: .cfi_def_cfa_offset 262140
@@ -207,7 +207,7 @@ entry:
 }
 
 ; FP + large frame: spill FP+SR+LR = entsp 2 + 256  + extsp 1
-; CHECKFP-LABEL:f8
+; CHECKFP-LABEL: f8:
 ; CHECKFP: entsp 258
 ; CHECKFP-NEXT: stw r10, sp[1]
 ; CHECKFP-NEXT: ldaw r10, sp[0]
@@ -221,7 +221,7 @@ entry:
 ; CHECKFP-NEXT: retsp 258
 ;
 ; !FP + large frame: spill SR+SR+LR = entsp 3 + 256
-; CHECK-LABEL:f8
+; CHECK-LABEL: f8:
 ; CHECK: entsp 257
 ; CHECK-NEXT: ldaw r0, sp[254]
 ; CHECK-NEXT: bl f5
@@ -235,7 +235,7 @@ entry:
 }
 
 ; FP + large frame: spill FP+SR+LR = entsp 2 + 32768  + extsp 1
-; CHECKFP-LABEL:f9
+; CHECKFP-LABEL: f9:
 ; CHECKFP: entsp 32770
 ; CHECKFP-NEXT: stw r10, sp[1]
 ; CHECKFP-NEXT: ldaw r10, sp[0]
@@ -249,7 +249,7 @@ entry:
 ; CHECKFP-NEXT: retsp 32770
 ;
 ; !FP + large frame: spill SR+SR+LR = entsp 3 + 32768
-; CHECK-LABEL:f9
+; CHECK-LABEL: f9:
 ; CHECK: entsp 32771
 ; CHECK-NEXT: ldaw r0, sp[32768]
 ; CHECK-NEXT: bl f5
diff --git a/test/DebugInfo/Generic/empty.ll b/test/DebugInfo/Generic/empty.ll
index d5f738fa0271..79912841fa6d 100644
--- a/test/DebugInfo/Generic/empty.ll
+++ b/test/DebugInfo/Generic/empty.ll
@@ -13,10 +13,9 @@
 ; CHECK-NOT: file_names[
 
 ; CHECK: .debug_pubnames contents:
-; CHECK-NOT: Offset
+; CHECK-NOT: {{^}}0x
 
-; CHECK: .debug_pubtypes contents:
-; CHECK-NOT: Offset
+; CHECK: contents:
 
 ; Don't emit DW_AT_addr_base when there are no addresses.
 ; FISSION-NOT: DW_AT_GNU_addr_base [DW_FORM_sec_offset]
@@ -24,8 +23,10 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!5}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1 (trunk 143523)", isOptimized: true, emissionKind: FullDebug, file: !4, enums: !2, retainedTypes: !2, globals: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1 (trunk 143523)", isOptimized: true, emissionKind: FullDebug, file: !4, enums: !2, retainedTypes: !6, globals: !2)
 !2 = !{}
 !3 = !DIFile(filename: "empty.c", directory: "/home/nlewycky")
 !4 = !DIFile(filename: "empty.c", directory: "/home/nlewycky")
 !5 = !{i32 1, !"Debug Info Version", i32 3}
+!6 = !{!7}
+!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/test/DebugInfo/Generic/nodebug.ll b/test/DebugInfo/Generic/nodebug.ll
index f85b00bf9f7e..9b0eb9b4dd07 100644
--- a/test/DebugInfo/Generic/nodebug.ll
+++ b/test/DebugInfo/Generic/nodebug.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: object-emission
 
-; RUN: %llc_dwarf < %s -filetype=obj | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+; RUN: %llc_dwarf < %s -filetype=obj | llvm-dwarfdump - | FileCheck %s
 
 ; Test that a nodebug function (a function not appearing in the debug info IR
 ; metadata subprogram list) with DebugLocs on its IR doesn't cause crashes/does
@@ -17,9 +17,16 @@
 ; }
 
 ; Check that there's no DW_TAG_subprogram, not even for the 'f2' function.
+; CHECK: .debug_info contents:
 ; CHECK: DW_TAG_compile_unit
 ; CHECK-NOT: DW_TAG_subprogram
 
+; Expect no line table entry since there are no functions and file references in this compile unit
+; CHECK: .debug_line contents:
+; CHECK: Line table prologue:
+; CHECK: total_length: 0x00000019
+; CHECK-NOT: file_names[
+
 @i = external global i32
 
 ; Function Attrs: uwtable
@@ -35,7 +42,7 @@ attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="
 !llvm.module.flags = !{!8, !9}
 !llvm.ident = !{!10}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !12, globals: !2, imports: !2)
 !1 = !DIFile(filename: "nodebug.cpp", directory: "/tmp/dbginfo")
 !2 = !{}
 !4 = distinct !DISubprogram(name: "f1", linkageName: "_Z2f1v", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 2, file: !1, scope: !5, type: !6, variables: !2)
@@ -46,3 +53,5 @@ attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="
 !9 = !{i32 2, !"Debug Info Version", i32 3}
 !10 = !{!"clang version 3.5.0 "}
 !11 = !DILocation(line: 3, scope: !4)
+!12 = !{!13}
+!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/test/DebugInfo/Generic/skeletoncu.ll b/test/DebugInfo/Generic/skeletoncu.ll
index 6d91afd0fa79..b9761b2ab565 100644
--- a/test/DebugInfo/Generic/skeletoncu.ll
+++ b/test/DebugInfo/Generic/skeletoncu.ll
@@ -7,9 +7,11 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: false, runtimeVersion: 2, splitDebugFilename: "my.dwo", emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2, dwoId: 43981)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: false, runtimeVersion: 2, splitDebugFilename: "my.dwo", emissionKind: FullDebug, enums: !2, retainedTypes: !5, globals: !2, imports: !2, dwoId: 43981)
 !1 = !DIFile(filename: "<stdin>", directory: "/")
 !2 = !{}
 !3 = !{i32 2, !"Dwarf Version", i32 4}
 !4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!6}
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
 
diff --git a/test/DebugInfo/Inputs/split-dwarf-dwp.cpp b/test/DebugInfo/Inputs/split-dwarf-dwp.cpp
new file mode 100644
index 000000000000..b07a1537d6bf
--- /dev/null
+++ b/test/DebugInfo/Inputs/split-dwarf-dwp.cpp
@@ -0,0 +1,12 @@
+void f1();
+__attribute__((always_inline)) void f2() {
+  f1();
+}
+void f3() {
+  f2();
+}
+
+To produce split-dwarf-dwp.o{,dwp}, run:
+
+  $ clang++ split-dwarf-dwp.cpp -gsplit-dwarf -c -Xclang -fdebug-compilation-dir=Output -fno-split-dwarf-inlining
+  $ llvm-dwp split-dwarf-dwp.dwo -o split-dwarf-dwp.o.dwp
diff --git a/test/DebugInfo/Inputs/split-dwarf-dwp.o b/test/DebugInfo/Inputs/split-dwarf-dwp.o
new file mode 100644
index 000000000000..614c62040dec
--- /dev/null
+++ b/test/DebugInfo/Inputs/split-dwarf-dwp.o
diff --git a/test/DebugInfo/Inputs/split-dwarf-dwp.o.dwp b/test/DebugInfo/Inputs/split-dwarf-dwp.o.dwp
new file mode 100644
index 000000000000..16a0af8c062f
--- /dev/null
+++ b/test/DebugInfo/Inputs/split-dwarf-dwp.o.dwp
diff --git a/test/DebugInfo/MIR/X86/empty-inline.mir b/test/DebugInfo/MIR/X86/empty-inline.mir
new file mode 100644
index 000000000000..1766a8f44616
--- /dev/null
+++ b/test/DebugInfo/MIR/X86/empty-inline.mir
@@ -0,0 +1,122 @@
+# RUN: llc  -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s
+#
+# This testcase has an implicit def pseudo-iunstruction with a debug location.
+#
+# CHECK: .debug_info contents:
+# CHECK: DW_TAG_subprogram
+# CHECK:	DW_AT_low_pc [DW_FORM_addr]	(0x0000000000000000)
+# CHECK-NOT: DW_TAG
+# CHECK:	DW_AT_specification {{.*}} "_ZN1C5m_fn3Ev"
+# CHECK-NOT: DW_TAG
+#	     Here should not be an inlined subroutine with 0 length.
+# CHECK: NULL
+#
+# CHECK: Address            Line   Column File   ISA Discriminator Flags
+# CHECK-NEXT:                ---
+# CHECK-NEXT:                 25      0      1   0             0  is_stmt
+# CHECK-NEXT:                 29     28      1   0             0  is_stmt prologue_end
+# CHECK-NEXT:                 29     28      1   0             0  is_stmt end_sequence
+--- |
+  source_filename = "t.ll"
+  target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-apple-macosx"
+  
+  %class.E = type { %class.D }
+  %class.D = type { %class.B }
+  %class.B = type { %class.A, %class.A }
+  %class.A = type { i8 }
+  %class.C = type <{ %class.E*, %class.B, [2 x i8] }>
+  
+  @a = local_unnamed_addr global %class.E* null, align 4
+  
+  define i32 @_ZN1C5m_fn3Ev(%class.C* nocapture) local_unnamed_addr align 2 !dbg !6 {
+    %2 = alloca %class.B, align 1
+    %3 = load %class.E*, %class.E** @a, align 4
+    %4 = icmp eq %class.E* %3, null
+    br i1 %4, label %10, label %5
+  
+  ; <label>:5:                                      ; preds = %1
+    %6 = bitcast %class.C* %0 to %class.D**
+    %7 = load %class.D*, %class.D** %6, align 4
+    %8 = bitcast %class.D* %7 to i8*
+    %9 = load i8, i8* %8, align 1
+    br label %10
+  
+  ; <label>:10:                                     ; preds = %5, %1
+    %11 = phi i8 [ %9, %5 ], [ undef, %1 ], !dbg !10
+    %12 = getelementptr inbounds %class.C, %class.C* %0, i32 0, i32 1, i32 0, i32 0
+    store i8 %11, i8* %12, align 1, !dbg !14
+    ret i32 undef
+  }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4, !5}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "Apple LLVM version 8.1.0 (clang-802.0.30.3)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !2)
+  !1 = !DIFile(filename: "test.ii", directory: "/")
+  !2 = !{}
+  !3 = !{i32 2, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = !{i32 1, !"PIC Level", i32 2}
+  !6 = distinct !DISubprogram(name: "m_fn3", linkageName: "_ZN1C5m_fn3Ev", scope: !7, file: !1, line: 25, type: !8, isLocal: false, isDefinition: true, scopeLine: 25, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !9, variables: !2)
+  !7 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "C", file: !1, line: 14, size: 64, align: 32, elements: !2, identifier: "_ZTS1C")
+  !8 = !DISubroutineType(types: !2)
+  !9 = !DISubprogram(name: "m_fn3", linkageName: "_ZN1C5m_fn3Ev", scope: !7, file: !1, line: 15, type: !8, isLocal: false, isDefinition: false, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: true)
+  !10 = !DILocation(line: 99, column: 9, scope: !11, inlinedAt: !14)
+  !11 = distinct !DISubprogram(name: "m_fn1", linkageName: "_ZN1A5m_fn1Ev", scope: !12, file: !1, line: 5, type: !8, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !13, variables: !2)
+  !12 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !1, line: 1, size: 8, align: 8, elements: !2, identifier: "_ZTS1A")
+  !13 = !DISubprogram(name: "m_fn1", linkageName: "_ZN1A5m_fn1Ev", scope: !12, file: !1, line: 5, type: !8, isLocal: false, isDefinition: false, scopeLine: 5, flags: DIFlagPublic | DIFlagPrototyped, isOptimized: true)
+  !14 = !DILocation(line: 29, column: 28, scope: !6)
+
+...
+---
+name:            _ZN1C5m_fn3Ev
+alignment:       4
+exposesReturnsTwice: false
+noVRegs:         true
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:         
+  - { reg: '%rdi' }
+frameInfo:       
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+#stack:           
+#  - { id: 0, name: '<unnamed alloca>', offset: -16, size: 2, alignment: 8 }
+body:             |
+  bb.0 (%ir-block.1):
+    successors: %bb.1(0x30000000), %bb.2(0x50000000)
+    liveins: %rdi
+  
+    CMP64mi8 %rip, 1, _, @a, _, 0, implicit-def %eflags :: (dereferenceable load 8 from @a, align 4)
+    JE_1 %bb.1, implicit %eflags
+  
+  bb.2 (%ir-block.5):
+    liveins: %rdi
+  
+    %rax = MOV64rm %rdi, 1, _, 0, _ :: (load 8 from %ir.6, align 4)
+    %al = MOV8rm killed %rax, 1, _, 0, _ :: (load 1 from %ir.8)
+    MOV8mr killed %rdi, 1, _, 8, _, killed %al, debug-location !14 :: (store 1 into %ir.12)
+    RETQ undef %eax
+  
+  bb.1:
+    liveins: %rdi
+  
+    %al = IMPLICIT_DEF debug-location !10
+    MOV8mr killed %rdi, 1, _, 8, _, killed %al, debug-location !14 :: (store 1 into %ir.12)
+    RETQ undef %eax
+
+...
diff --git a/test/DebugInfo/PDB/Inputs/merge-ids-1.yaml b/test/DebugInfo/PDB/Inputs/merge-ids-1.yaml
new file mode 100644
index 000000000000..3b5e8b5e761a
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/merge-ids-1.yaml
@@ -0,0 +1,36 @@
+IpiStream:
+  Records:
+    # 'One' [TypeIndex: 0x1000 (4096)]
+    - Kind:            LF_STRING_ID
+      StringId:        
+        Id:              0
+        String:          'One'
+    # 'Two' [TypeIndex: 0x1001 (4097)]
+    - Kind:            LF_STRING_ID
+      StringId:        
+        Id:              0
+        String:          'Two'
+    # 'OnlyInFirst' [TypeIndex: 0x1002 (4098)]
+    - Kind:            LF_STRING_ID
+      StringId:        
+        Id:              0
+        String:          'OnlyInFirst'
+    # 'SubOne' [TypeIndex: 0x1003 (4099)]
+    - Kind:            LF_STRING_ID
+      StringId:        
+        Id:              0
+        String:          'SubOne'
+    # 'SubTwo' [TypeIndex: 0x1004 (4100)]
+    - Kind:            LF_STRING_ID
+      StringId:        
+        Id:              0
+        String:          'SubTwo'
+    # 'SubOne', 'SubTwo' [TypeIndex: 0x1005 (4101)]
+    - Kind:            LF_SUBSTR_LIST
+      StringList:      
+        StringIndices:   [ 4099, 4100 ]
+    # 'Main' {'SubOne', 'SubTwo'} [TypeIndex: 0x1006 (4102)]
+    - Kind:            LF_STRING_ID
+      StringId:        
+        Id:              4101
+        String:          'Main'
diff --git a/test/DebugInfo/PDB/Inputs/merge-ids-2.yaml b/test/DebugInfo/PDB/Inputs/merge-ids-2.yaml
new file mode 100644
index 000000000000..74f6ee502249
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/merge-ids-2.yaml
@@ -0,0 +1,31 @@
+IpiStream:       
+  Records:
+    # 'SubTwo' [TypeIndex: 0x1000 (4096)]
+    - Kind:            LF_STRING_ID
+      StringId:        
+        Id:              0
+        String:          'SubTwo'
+    # 'OnlyInSecond' [TypeIndex: 0x1001 (4097)]
+    - Kind:            LF_STRING_ID
+      StringId:        
+        Id:              0
+        String:          'OnlyInSecond'
+    # 'SubOne' [TypeIndex: 0x1002 (4098)]
+    - Kind:            LF_STRING_ID
+      StringId:        
+        Id:              0
+        String:          'SubOne'
+    # 'SubOne', 'SubTwo' [TypeIndex: 0x1003 (4099)]
+    - Kind:            LF_SUBSTR_LIST
+      StringList:      
+        StringIndices:   [ 4098, 4096 ]
+    # 'One' [TypeIndex: 0x1004 (4100)]
+    - Kind:            LF_STRING_ID
+      StringId:        
+        Id:              0
+        String:          'One'
+    # 'Main' {'SubOne', 'SubTwo'} [TypeIndex: 0x1005 (4101)]
+    - Kind:            LF_STRING_ID
+      StringId:        
+        Id:              4099
+        String:          'Main'
diff --git a/test/DebugInfo/PDB/Inputs/merge-ids-and-types-1.yaml b/test/DebugInfo/PDB/Inputs/merge-ids-and-types-1.yaml
new file mode 100644
index 000000000000..30ff563d7fc6
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/merge-ids-and-types-1.yaml
@@ -0,0 +1,113 @@
+# The idea is to set up some types in the TPI stream, and then have records in
+# the IPI stream that refer to them.  There are three types of IPI records that
+# can refer to TPI records.  They are:
+# 1) LF_PROCEDURE - Referred to by LF_FUNC_ID
+# 2) LF_STRUCTURE - Referred to by LF_UDT_MOD_SRC_LINE
+#                   Referred to by LF_UDT_SRC_LINE
+# 3) LF_MFUNCTION - Referred to by LF_MFUNC_ID
+# We will set up one of each of these, and then create IPI records that refer to
+# them.  We intentionally choose an unintuitive ordering of the records in both
+# streams (while still maintaining the topological sorting required by CodeView
+# type streams), to make sure the merging algorithm is sufficiently exercised.
+# For easy understanding, a semantic representation of the types we will set up
+# is as follows:
+#  - int main(int, char**)
+#
+#  - struct FooBar {
+#    public:
+#      void *FooMember;
+#      void FooMethod(int);
+#    };
+TpiStream:
+  Records:
+    # TypeIndex: 4096 (0x1000)
+    # char**
+    - Kind:            LF_POINTER
+      Pointer:         
+        ReferentType:    1136
+        Attrs:           32778
+    # TypeIndex: 4097 (0x1001)
+    # public void *FooMember
+    - Kind:            LF_FIELDLIST
+      FieldList:       
+        - Kind:            LF_MEMBER
+          DataMember:      
+            Attrs:           3           # public
+            Type:            1027        # void*
+            FieldOffset:     0
+            Name:            FooMember   # FooMember
+    # TypeIndex: 4098 (0x1002)
+    # (int, char**)
+    - Kind:            LF_ARGLIST
+      ArgList:         
+        ArgIndices:      [ 116, 4096 ]
+    # TypeIndex: 4099 (0x1003)
+    # struct FooBar {
+    # public:
+    #   void *FooMember;
+    # };
+    - Kind:            LF_STRUCTURE
+      Class:           
+        MemberCount:     1
+        Options:         [ None, HasUniqueName ]
+        FieldList:       4097
+        Name:            FooBar
+        UniqueName:      'FooBar'
+        DerivationList:  0
+        VTableShape:     0
+        Size:            4
+    # TypeIndex: 4100 (0x1004)
+    # FooBar *
+    - Kind:            LF_POINTER
+      Pointer:         
+        ReferentType:    4099       # FooBar
+        Attrs:           32778
+    # TypeIndex: 4101 (0x1005)
+    # (int)
+    - Kind:            LF_ARGLIST
+      ArgList:         
+        ArgIndices:      [ 116 ]
+    # TypeIndex: 4102 (0x1006)
+    - Kind:            LF_MFUNCTION
+      MemberFunction:  
+        ReturnType:      3                      # void
+        ClassType:       4099                   # struct FooBar
+        ThisType:        4100                   # FooBar *
+        CallConv:        ThisCall
+        Options:         [ None, Constructor ]
+        ParameterCount:  1
+        ArgumentList:    4101                   # (int)
+        ThisPointerAdjustment: 0
+    # TypeIndex: 4103 (0x1007)
+    # int (int, char**)
+    - Kind:            LF_PROCEDURE
+      Procedure:       
+        ReturnType:      116         # int
+        CallConv:        NearC
+        Options:         [ None ]
+        ParameterCount:  2
+        ArgumentList:    4098        # (int, char**)
+IpiStream:
+  Records:
+    # TypeIndex: 4096 (0x1000)
+    # int main(int, char **)
+    - Kind:            LF_FUNC_ID
+      FuncId:          
+        ParentScope:     0
+        FunctionType:    4103       # int main(int, char**)
+        Name:            main
+    # TypeIndex: 4097 (0x1001)
+    # void FooBar::FooMethod(int)
+    - Kind:            LF_MFUNC_ID
+      MemberFuncId:    
+        ClassType:       4099       # struct FooBar
+        FunctionType:    4102       # void FooMethod(int)
+        Name:            FooMethod
+    # TypeIndex: 4098 (0x1002)
+    # struct FooBar
+    - Kind:            LF_UDT_MOD_SRC_LINE
+      UdtModSourceLine: 
+        UDT:             4099       # struct FooBar
+        SourceFile:      0          # We don't support this yet
+        LineNumber:      0
+        Module:          0          # We don't support this yet
diff --git a/test/DebugInfo/PDB/Inputs/merge-ids-and-types-2.yaml b/test/DebugInfo/PDB/Inputs/merge-ids-and-types-2.yaml
new file mode 100644
index 000000000000..1bd54deebffd
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/merge-ids-and-types-2.yaml
@@ -0,0 +1,143 @@
+# In file 1 we set up some basic types and IDs to refer to them.  In this file
+# we will set up the same types.  For some of them we will make them identical
+# but re-order the records in the file to make sure they have different type
+# indices and appear in different orders.  In other cases we will make slight
+# adjustments to the types, to ensure that they do not get merged in.
+# 
+# For easy understanding, a semantic representation of the types we will set up
+# is as follows:
+#  - int main(int, char**)    // This record should share an LF_PROCEDURE and id
+#                             // record with corresponding function from the
+#                             // first file.
+#  - int main2(int, char**)   // This record should share the LF_PROCEDURE
+#                             // record but have a unique id record.
+#  - void foo(int, char**)    // This record should have a unique LF_PROCEDURE
+#                             // record, but the LF_ARGLIST record internally
+#                             // should be shared.
+#
+#  - struct FooBar {          // Because the type of this record exactly matches
+#                             // the corresponding file, its entire type record
+#                             // hierarchy should be shared.
+#    public:
+#      void *FooMember;
+#      void FooMethod2(int);  // Note that the *type* of this member should be
+#                             // the same as the type of the record from the
+#                             // first stream.  But since it has a different
+#                             // name, it will not share an id record.
+#    };
+TpiStream:
+  Records:
+    # TypeIndex: 4096 (0x1000)
+    # (int)
+    - Kind:            LF_ARGLIST
+      ArgList:         
+        ArgIndices:      [ 116 ]
+    # TypeIndex: 4097 (0x1001)
+    # public void *FooMember
+    - Kind:            LF_FIELDLIST
+      FieldList:       
+        - Kind:            LF_MEMBER
+          DataMember:      
+            Attrs:           3           # public
+            Type:            1027        # void*
+            FieldOffset:     0
+            Name:            FooMember   # FooMember
+    # TypeIndex: 4098 (0x1002)
+    # char**
+    - Kind:            LF_POINTER
+      Pointer:         
+        ReferentType:    1136
+        Attrs:           32778
+    # TypeIndex: 4099 (0x1003)
+    # (int, char**)
+    - Kind:            LF_ARGLIST
+      ArgList:         
+        ArgIndices:      [ 116, 4098 ]
+    # TypeIndex: 4100 (0x1004)
+    # struct FooBar {
+    # public:
+    #   void *FooMember;
+    # };
+    - Kind:            LF_STRUCTURE
+      Class:           
+        MemberCount:     1
+        Options:         [ None, HasUniqueName ]
+        FieldList:       4097
+        Name:            FooBar
+        UniqueName:      'FooBar'
+        DerivationList:  0
+        VTableShape:     0
+        Size:            4
+    # TypeIndex: 4101 (0x1005)
+    # void (int, char**)
+    - Kind:            LF_PROCEDURE
+      Procedure:       
+        ReturnType:      3           # void
+        CallConv:        NearC
+        Options:         [ None ]
+        ParameterCount:  2
+        ArgumentList:    4099        # (int, char**)
+    # TypeIndex: 4102 (0x1006)
+    # FooBar *
+    - Kind:            LF_POINTER
+      Pointer:         
+        ReferentType:    4100       # FooBar
+        Attrs:           32778
+    # TypeIndex: 4103 (0x1007)
+    # int (int, char**)
+    - Kind:            LF_PROCEDURE
+      Procedure:       
+        ReturnType:      116         # int
+        CallConv:        NearC
+        Options:         [ None ]
+        ParameterCount:  2
+        ArgumentList:    4099        # (int, char**)
+    # TypeIndex: 4104 (0x1008)
+    - Kind:            LF_MFUNCTION
+      MemberFunction:  
+        ReturnType:      3                      # void
+        ClassType:       4100                   # struct FooBar
+        ThisType:        4102                   # FooBar *
+        CallConv:        ThisCall
+        Options:         [ None, Constructor ]
+        ParameterCount:  1
+        ArgumentList:    4096                   # (int)
+        ThisPointerAdjustment: 0
+IpiStream:
+  Records:
+    # TypeIndex: 4096 (0x1000)
+    # struct FooBar
+    - Kind:            LF_UDT_MOD_SRC_LINE
+      UdtModSourceLine: 
+        UDT:             4100       # struct FooBar
+        SourceFile:      0          # We don't support this yet
+        LineNumber:      0
+        Module:          0          # We don't support this yet
+    # TypeIndex: 4097 (0x1001)
+    # int main2(int, char **)
+    - Kind:            LF_FUNC_ID
+      FuncId:          
+        ParentScope:     0
+        FunctionType:    4103       # int main2(int, char**)
+        Name:            main2
+    # TypeIndex: 4098 (0x1002)
+    # void foo(int, char **)
+    - Kind:            LF_FUNC_ID
+      FuncId:          
+        ParentScope:     0
+        FunctionType:    4101       # void main2(int, char**)
+        Name:            foo
+    # TypeIndex: 4099 (0x1003)
+    # void FooBar::FooMethod2(int)
+    - Kind:            LF_MFUNC_ID
+      MemberFuncId:    
+        ClassType:       4100       # struct FooBar
+        FunctionType:    4104       # void FooBar::FooMethod2(int)
+        Name:            FooMethod2
+    # TypeIndex: 4100 (0x1004)
+    # int main(int, char **)
+    - Kind:            LF_FUNC_ID
+      FuncId:          
+        ParentScope:     0
+        FunctionType:    4103       # int main(int, char**)
+        Name:            main
diff --git a/test/DebugInfo/PDB/Inputs/merge1.yaml b/test/DebugInfo/PDB/Inputs/merge-types-1.yaml
index 89d471e3343d..89d471e3343d 100644
--- a/test/DebugInfo/PDB/Inputs/merge1.yaml
+++ b/test/DebugInfo/PDB/Inputs/merge-types-1.yaml
diff --git a/test/DebugInfo/PDB/Inputs/merge2.yaml b/test/DebugInfo/PDB/Inputs/merge-types-2.yaml
index b6cbdb98f0ca..b6cbdb98f0ca 100644
--- a/test/DebugInfo/PDB/Inputs/merge2.yaml
+++ b/test/DebugInfo/PDB/Inputs/merge-types-2.yaml
diff --git a/test/DebugInfo/PDB/Inputs/source-names-1.yaml b/test/DebugInfo/PDB/Inputs/source-names-1.yaml
new file mode 100644
index 000000000000..96f7dedd2fc4
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/source-names-1.yaml
@@ -0,0 +1,8 @@
+---
+DbiStream:
+  Modules:
+    - Module:          'C:\src\test.obj'
+      ObjFile:         'C:\src\test.obj'
+      SourceFiles:
+        - 'C:\src\test.c'
+...
diff --git a/test/DebugInfo/PDB/Inputs/source-names-2.yaml b/test/DebugInfo/PDB/Inputs/source-names-2.yaml
new file mode 100644
index 000000000000..5f782ddbca25
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/source-names-2.yaml
@@ -0,0 +1,8 @@
+---
+DbiStream:
+  Modules:
+    - Module:          'C:\src\test.obj'
+      ObjFile:         'C:\src\test.obj'
+      SourceFiles:
+        - 'C:\src\test.cc'
+...
diff --git a/test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test b/test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test
new file mode 100644
index 000000000000..ac32ce040b98
--- /dev/null
+++ b/test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test
@@ -0,0 +1,65 @@
+; RUN: llvm-pdbdump yaml2pdb -pdb=%t.1.pdb %p/Inputs/merge-ids-and-types-1.yaml
+; RUN: llvm-pdbdump yaml2pdb -pdb=%t.2.pdb %p/Inputs/merge-ids-and-types-2.yaml
+; RUN: llvm-pdbdump merge -pdb=%t.3.pdb %t.1.pdb %t.2.pdb
+; RUN: llvm-pdbdump raw -tpi-records %t.3.pdb | FileCheck -check-prefix=TPI-TYPES %s
+; RUN: llvm-pdbdump raw -tpi-records %t.3.pdb | FileCheck -check-prefix=INTMAIN %s
+; RUN: llvm-pdbdump raw -tpi-records %t.3.pdb | FileCheck -check-prefix=VOIDMAIN %s
+; RUN: llvm-pdbdump raw -ipi-records %t.3.pdb | FileCheck -check-prefix=IPI-TYPES %s
+; RUN: llvm-pdbdump raw -ipi-records %t.3.pdb | FileCheck -check-prefix=IPI-NAMES %s
+; RUN: llvm-pdbdump raw -ipi-records %t.3.pdb | FileCheck -check-prefix=IPI-UDT %s
+
+TPI-TYPES:     Type Info Stream (TPI)
+TPI-TYPES:     Record count: 9
+TPI-TYPES-DAG: TypeLeafKind: LF_POINTER
+TPI-TYPES-DAG: TypeLeafKind: LF_FIELDLIST
+TPI-TYPES-DAG: TypeLeafKind: LF_ARGLIST
+TPI-TYPES-DAG: TypeLeafKind: LF_STRUCTURE
+TPI-TYPES-DAG: TypeLeafKind: LF_MEMBER
+TPI-TYPES-DAG: TypeLeafKind: LF_POINTER
+TPI-TYPES-DAG: TypeLeafKind: LF_ARGLIST
+TPI-TYPES-DAG: TypeLeafKind: LF_MFUNCTION
+TPI-TYPES-DAG: TypeLeafKind: LF_PROCEDURE
+TPI-TYPES-DAG: TypeLeafKind: LF_PROCEDURE
+TPI-TYPES-DAG: TypeLeafKind: LF_ARGLIST
+
+; Both procedures should use the same arglist even though they have a different
+; return type.
+INTMAIN:      ArgList ([[ID:.*]])
+INTMAIN-NEXT:   TypeLeafKind: LF_ARGLIST
+INTMAIN-NEXT:   NumArgs: 2
+INTMAIN-NEXT:   Arguments [
+INTMAIN-NEXT:     ArgType: int
+INTMAIN-NEXT:     ArgType: char**
+INTMAIN:        TypeLeafKind: LF_PROCEDURE
+INTMAIN:          ReturnType: int
+INTMAIN:          NumParameters: 2
+INTMAIN-NEXT:     ArgListType: (int, char**) ([[ID]])
+
+VOIDMAIN:      ArgList ([[ID:.*]])
+VOIDMAIN-NEXT:   TypeLeafKind: LF_ARGLIST
+VOIDMAIN-NEXT:   NumArgs: 2
+VOIDMAIN-NEXT:   Arguments [
+VOIDMAIN-NEXT:     ArgType: int
+VOIDMAIN-NEXT:     ArgType: char**
+VOIDMAIN:        TypeLeafKind: LF_PROCEDURE
+VOIDMAIN:          ReturnType: void
+VOIDMAIN:          NumParameters: 2
+VOIDMAIN-NEXT:     ArgListType: (int, char**) ([[ID]])
+
+IPI-TYPES:     Type Info Stream (IPI)
+IPI-TYPES:     Record count: 6
+IPI-TYPES-DAG: TypeLeafKind: LF_FUNC_ID
+IPI-TYPES-DAG: TypeLeafKind: LF_MFUNC_ID
+IPI-TYPES-DAG: TypeLeafKind: LF_UDT_MOD_SRC_LINE
+IPI-TYPES-DAG: TypeLeafKind: LF_FUNC_ID
+IPI-TYPES-DAG: TypeLeafKind: LF_FUNC_ID
+IPI-TYPES-DAG: TypeLeafKind: LF_MFUNC_ID
+
+IPI-NAMES-DAG: Name: main
+IPI-NAMES-DAG: Name: FooMethod
+IPI-NAMES-DAG: Name: main2
+IPI-NAMES-DAG: Name: foo
+IPI-NAMES-DAG: Name: FooMethod2
+
+IPI-UDT:      TypeLeafKind: LF_UDT_MOD_SRC_LINE
+IPI-UDT-NEXT: UDT: FooBar
diff --git a/test/DebugInfo/PDB/pdbdump-mergeids.test b/test/DebugInfo/PDB/pdbdump-mergeids.test
new file mode 100644
index 000000000000..6a4d19eba042
--- /dev/null
+++ b/test/DebugInfo/PDB/pdbdump-mergeids.test
@@ -0,0 +1,31 @@
+; RUN: llvm-pdbdump yaml2pdb -pdb=%t.1.pdb %p/Inputs/merge-ids-1.yaml
+; RUN: llvm-pdbdump yaml2pdb -pdb=%t.2.pdb %p/Inputs/merge-ids-2.yaml
+; RUN: llvm-pdbdump merge -pdb=%t.3.pdb %t.1.pdb %t.2.pdb
+; RUN: llvm-pdbdump raw -ipi-records %t.3.pdb | FileCheck -check-prefix=MERGED %s
+; RUN: llvm-pdbdump raw -ipi-records %t.3.pdb | FileCheck -check-prefix=SUBSTRS %s
+; RUN: llvm-pdbdump raw -tpi-records %t.3.pdb | FileCheck -check-prefix=TPI-EMPTY %s
+
+
+MERGED: Type Info Stream (IPI)
+MERGED: Record count: 8
+MERGED-DAG: StringData: One
+MERGED-DAG: StringData: Two
+MERGED-DAG: StringData: SubOne
+MERGED-DAG: StringData: SubTwo
+MERGED-DAG: StringData: Main
+MERGED-DAG: TypeLeafKind: LF_SUBSTR_LIST
+MERGED-DAG: StringData: OnlyInFirst
+MERGED-DAG: StringData: OnlyInSecond
+
+SUBSTRS:      StringList
+SUBSTRS:        TypeLeafKind: LF_SUBSTR_LIST
+SUBSTRS-NEXT:   NumStrings: 2
+SUBSTRS-NEXT:   Strings [
+SUBSTRS-NEXT:     SubOne
+SUBSTRS-NEXT:     SubTwo
+SUBSTRS:      StringId
+SUBSTRS-NEXT:   TypeLeafKind: LF_STRING_ID
+SUBSTRS-NEXT:   Id: "SubOne" "SubTwo"
+SUBSTRS-NEXT:   StringData: Main
+
+TPI-EMPTY: Record count: 0
diff --git a/test/DebugInfo/PDB/pdbdump-mergetypes.test b/test/DebugInfo/PDB/pdbdump-mergetypes.test
index 96f6316d4766..a26b92631828 100644
--- a/test/DebugInfo/PDB/pdbdump-mergetypes.test
+++ b/test/DebugInfo/PDB/pdbdump-mergetypes.test
@@ -1,5 +1,5 @@
-; RUN: llvm-pdbdump yaml2pdb -pdb=%t.1.pdb %p/Inputs/merge1.yaml
-; RUN: llvm-pdbdump yaml2pdb -pdb=%t.2.pdb %p/Inputs/merge2.yaml
+; RUN: llvm-pdbdump yaml2pdb -pdb=%t.1.pdb %p/Inputs/merge-types-1.yaml
+; RUN: llvm-pdbdump yaml2pdb -pdb=%t.2.pdb %p/Inputs/merge-types-2.yaml
 ; RUN: llvm-pdbdump merge -pdb=%t.3.pdb %t.1.pdb %t.2.pdb
 ; RUN: llvm-pdbdump raw -tpi-records %t.3.pdb | FileCheck -check-prefix=MERGED %s
 ; RUN: llvm-pdbdump raw -tpi-records %t.3.pdb | FileCheck -check-prefix=ARGLIST %s
diff --git a/test/DebugInfo/PDB/pdbdump-objfilename.yaml b/test/DebugInfo/PDB/pdbdump-objfilename.yaml
new file mode 100644
index 000000000000..fac9ce9083c7
--- /dev/null
+++ b/test/DebugInfo/PDB/pdbdump-objfilename.yaml
@@ -0,0 +1,14 @@
+# RUN: llvm-pdbdump yaml2pdb -pdb=%T/objfilename.pdb %s
+# RUN: llvm-pdbdump pdb2yaml -dbi-module-info %T/objfilename.pdb \
+# RUN:     | FileCheck %s
+#
+# CHECK: DbiStream:
+# CHECK: Modules:
+# CHECK-NEXT: - Module:{{ *}}'C:\src\test.obj'
+# CHECK-NEXT: ObjFile:{{ *}}'C:\src\test.obj'
+---
+DbiStream:
+  Modules:
+    - Module:          'C:\src\test.obj'
+      ObjFile:         'C:\src\test.obj'
+...
diff --git a/test/DebugInfo/PDB/pdbdump-source-names.test b/test/DebugInfo/PDB/pdbdump-source-names.test
new file mode 100644
index 000000000000..181f4d5e0ee4
--- /dev/null
+++ b/test/DebugInfo/PDB/pdbdump-source-names.test
@@ -0,0 +1,20 @@
+# Test that we can write source file names to PDBs and read them back.
+# Because the subsection the file names are stored in is 4-byte
+# aligned, there is a possibility of misaligning the file names. This
+# will cause them to be read back empty or truncated.  To guard
+# against this, we test with two different lengths of file name data
+# that differ by one byte, so that at least one of those will only
+# pass if alignment is implemented correctly.
+
+RUN: llvm-pdbdump yaml2pdb -pdb=%T/source-names-1.pdb %p/Inputs/source-names-1.yaml
+RUN: llvm-pdbdump pdb2yaml -dbi-module-source-info %T/source-names-1.pdb \
+RUN:     | FileCheck -check-prefix=CHECK1 %s
+RUN: llvm-pdbdump yaml2pdb -pdb=%T/source-names-2.pdb %p/Inputs/source-names-2.yaml
+RUN: llvm-pdbdump pdb2yaml -dbi-module-source-info %T/source-names-2.pdb \
+RUN:     | FileCheck -check-prefix=CHECK2 %s
+
+CHECK1: SourceFiles:
+CHECK1: 'C:\src\test.c'
+
+CHECK2: SourceFiles:
+CHECK2: 'C:\src\test.cc'
diff --git a/test/DebugInfo/X86/array.ll b/test/DebugInfo/X86/array.ll
index 78cffcf69cf9..78dc12b4d377 100644
--- a/test/DebugInfo/X86/array.ll
+++ b/test/DebugInfo/X86/array.ll
@@ -16,85 +16,109 @@
 ; Test that we only emit register-indirect locations for the array array.
 ; rdar://problem/14874886
 ;
-; CHECK:     ##DEBUG_VALUE: main:array <- [%RSP+0]
 ; CHECK-NOT: ##DEBUG_VALUE: main:array <- %R{{.*}}
+; CHECK: movq    %rsp, %rdi
+; CHECK-NOT: ##DEBUG_VALUE: main:array <- %R{{.*}}
+; CHECK:     ##DEBUG_VALUE: main:array <- [%RDI+0]
+; CHECK-NOT: ##DEBUG_VALUE: main:array <- %R{{.*}}
+; ModuleID = '/tmp/array.c'
+source_filename = "/tmp/array.c"
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.9.0"
+target triple = "x86_64-apple-macosx10.12.0"
 
 @main.array = private unnamed_addr constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 16
 
 ; Function Attrs: nounwind ssp uwtable
-define void @f(i32* nocapture %p) #0 !dbg !4 {
-  tail call void @llvm.dbg.value(metadata i32* %p, i64 0, metadata !11, metadata !DIExpression()), !dbg !28
-  store i32 42, i32* %p, align 4, !dbg !29, !tbaa !30
-  ret void, !dbg !34
+define void @f(i32* nocapture %p) local_unnamed_addr #0 !dbg !8 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32* %p, i64 0, metadata !14, metadata !15), !dbg !16
+  store i32 42, i32* %p, align 4, !dbg !17, !tbaa !18
+  ret void, !dbg !22
 }
 
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
 ; Function Attrs: nounwind ssp uwtable
-define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 !dbg !12 {
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) local_unnamed_addr #0 !dbg !23 {
+entry:
   %array = alloca [4 x i32], align 16
-  tail call void @llvm.dbg.value(metadata i32 %argc, i64 0, metadata !19, metadata !DIExpression()), !dbg !35
-  tail call void @llvm.dbg.value(metadata i8** %argv, i64 0, metadata !20, metadata !DIExpression()), !dbg !35
-  tail call void @llvm.dbg.value(metadata [4 x i32]* %array, i64 0, metadata !21, metadata !DIExpression(DW_OP_deref)), !dbg !36
-  %1 = bitcast [4 x i32]* %array to i8*, !dbg !36
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ([4 x i32]* @main.array to i8*), i64 16, i32 16, i1 false), !dbg !36
-  tail call void @llvm.dbg.value(metadata [4 x i32]* %array, i64 0, metadata !21, metadata !DIExpression(DW_OP_deref)), !dbg !36
-  %2 = getelementptr inbounds [4 x i32], [4 x i32]* %array, i64 0, i64 0, !dbg !37
-  call void @f(i32* %2), !dbg !37
-  tail call void @llvm.dbg.value(metadata [4 x i32]* %array, i64 0, metadata !21, metadata !DIExpression(DW_OP_deref)), !dbg !36
-  %3 = load i32, i32* %2, align 16, !dbg !38, !tbaa !30
-  ret i32 %3, !dbg !38
+  tail call void @llvm.dbg.value(metadata i32 %argc, i64 0, metadata !30, metadata !15), !dbg !36
+  tail call void @llvm.dbg.value(metadata i8** %argv, i64 0, metadata !31, metadata !15), !dbg !37
+  %0 = bitcast [4 x i32]* %array to i8*, !dbg !38
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* nonnull %0) #3, !dbg !38
+  tail call void @llvm.dbg.declare(metadata [4 x i32]* %array, metadata !32, metadata !15), !dbg !39
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %0, i8* bitcast ([4 x i32]* @main.array to i8*), i64 16, i32 16, i1 false), !dbg !39
+  %arraydecay = getelementptr inbounds [4 x i32], [4 x i32]* %array, i64 0, i64 0, !dbg !40
+  call void @f(i32* nonnull %arraydecay), !dbg !41
+  %1 = load i32, i32* %arraydecay, align 16, !dbg !42, !tbaa !18
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* nonnull %0) #3, !dbg !43
+  ret i32 %1, !dbg !44
 }
 
-; Function Attrs: nounwind
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #2
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2
 
-; Function Attrs: nounwind readnone
-declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
 
 attributes #0 = { nounwind ssp uwtable }
-attributes #1 = { nounwind }
-attributes #2 = { nounwind readnone }
+attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { argmemonly nounwind }
+attributes #3 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!25, !26}
-!llvm.ident = !{!27}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!7}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.5.0 ", isOptimized: true, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
-!1 = !DIFile(filename: "array.c", directory: "")
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (trunk 303873) (llvm/trunk 303875)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/array.c", directory: "/")
 !2 = !{}
-!4 = distinct !DISubprogram(name: "f", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, variables: !10)
-!5 = !DIFile(filename: "array.c", directory: "")
-!6 = !DISubroutineType(types: !7)
-!7 = !{null, !8}
-!8 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !9)
-!9 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!10 = !{!11}
-!11 = !DILocalVariable(name: "p", line: 1, arg: 1, scope: !4, file: !5, type: !8)
-!12 = distinct !DISubprogram(name: "main", line: 5, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 5, file: !1, scope: !5, type: !13, variables: !18)
-!13 = !DISubroutineType(types: !14)
-!14 = !{!9, !9, !15}
-!15 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !16)
-!16 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !17)
-!17 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
-!18 = !{!19, !20, !21}
-!19 = !DILocalVariable(name: "argc", line: 5, arg: 1, scope: !12, file: !5, type: !9)
-!20 = !DILocalVariable(name: "argv", line: 5, arg: 2, scope: !12, file: !5, type: !15)
-!21 = !DILocalVariable(name: "array", line: 6, scope: !12, file: !5, type: !22)
-!22 = !DICompositeType(tag: DW_TAG_array_type, size: 128, align: 32, baseType: !9, elements: !23)
-!23 = !{!24}
-!24 = !DISubrange(count: 4)
-!25 = !{i32 2, !"Dwarf Version", i32 2}
-!26 = !{i32 1, !"Debug Info Version", i32 3}
-!27 = !{!"clang version 3.5.0 "}
-!28 = !DILocation(line: 1, scope: !4)
-!29 = !DILocation(line: 2, scope: !4)
-!30 = !{!31, !31, i64 0}
-!31 = !{!"int", !32, i64 0}
-!32 = !{!"omnipotent char", !33, i64 0}
-!33 = !{!"Simple C/C++ TBAA"}
-!34 = !DILocation(line: 3, scope: !4)
-!35 = !DILocation(line: 5, scope: !12)
-!36 = !DILocation(line: 6, scope: !12)
-!37 = !DILocation(line: 7, scope: !12)
-!38 = !DILocation(line: 8, scope: !12)
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!7 = !{!"clang version 5.0.0 (trunk 303873) (llvm/trunk 303875)"}
+!8 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !9, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !13)
+!9 = !DISubroutineType(types: !10)
+!10 = !{null, !11}
+!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64)
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !{!14}
+!14 = !DILocalVariable(name: "p", arg: 1, scope: !8, file: !1, line: 1, type: !11)
+!15 = !DIExpression()
+!16 = !DILocation(line: 1, column: 13, scope: !8)
+!17 = !DILocation(line: 2, column: 8, scope: !8)
+!18 = !{!19, !19, i64 0}
+!19 = !{!"int", !20, i64 0}
+!20 = !{!"omnipotent char", !21, i64 0}
+!21 = !{!"Simple C/C++ TBAA"}
+!22 = !DILocation(line: 3, column: 1, scope: !8)
+!23 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 5, type: !24, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !29)
+!24 = !DISubroutineType(types: !25)
+!25 = !{!12, !12, !26}
+!26 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !27, size: 64)
+!27 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !28, size: 64)
+!28 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!29 = !{!30, !31, !32}
+!30 = !DILocalVariable(name: "argc", arg: 1, scope: !23, file: !1, line: 5, type: !12)
+!31 = !DILocalVariable(name: "argv", arg: 2, scope: !23, file: !1, line: 5, type: !26)
+!32 = !DILocalVariable(name: "array", scope: !23, file: !1, line: 6, type: !33)
+!33 = !DICompositeType(tag: DW_TAG_array_type, baseType: !12, size: 128, elements: !34)
+!34 = !{!35}
+!35 = !DISubrange(count: 4)
+!36 = !DILocation(line: 5, column: 14, scope: !23)
+!37 = !DILocation(line: 5, column: 27, scope: !23)
+!38 = !DILocation(line: 6, column: 3, scope: !23)
+!39 = !DILocation(line: 6, column: 7, scope: !23)
+!40 = !DILocation(line: 7, column: 5, scope: !23)
+!41 = !DILocation(line: 7, column: 3, scope: !23)
+!42 = !DILocation(line: 8, column: 10, scope: !23)
+!43 = !DILocation(line: 9, column: 1, scope: !23)
+!44 = !DILocation(line: 8, column: 3, scope: !23)
diff --git a/test/DebugInfo/X86/dbg-value-frame-index.ll b/test/DebugInfo/X86/dbg-value-frame-index.ll
index 7b49aacfaefd..3c3c21257ed9 100644
--- a/test/DebugInfo/X86/dbg-value-frame-index.ll
+++ b/test/DebugInfo/X86/dbg-value-frame-index.ll
@@ -32,7 +32,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 !3 = !{i32 2, !"Debug Info Version", i32 3}
 !4 = distinct !DISubprogram(name: "test", type: !10, unit: !0)
 !5 = !DILocalVariable(name: "w", scope: !4, type: !9)
-!6 = !DIExpression(DW_OP_deref)
+!6 = !DIExpression()
 !7 = !DILocation(line: 210, column: 12, scope: !4)
 !8 = !{!9}
 !9 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean)
diff --git a/test/DebugInfo/X86/debug-loc-offset.ll b/test/DebugInfo/X86/debug-loc-offset.ll
index 8e3e1e97c319..8f2210e2c014 100644
--- a/test/DebugInfo/X86/debug-loc-offset.ll
+++ b/test/DebugInfo/X86/debug-loc-offset.ll
@@ -35,10 +35,6 @@
 ; CHECK: DW_AT_low_pc
 ; CHECK: DW_AT_high_pc
 
-; CHECK: DW_TAG_compile_unit
-; CHECK: DW_AT_low_pc
-; CHECK: DW_AT_high_pc
-
 ; CHECK: DW_TAG_subprogram
 ; CHECK-NOT: DW_TAG
 ; CHECK: DW_AT_linkage_name [DW_FORM_strp]{{.*}}"_Z3baz1A"
@@ -53,6 +49,10 @@
 ; CHECK: DW_AT_location [DW_FORM_exprloc]
 ; CHECK-NOT: DW_AT_location
 
+; CHECK: DW_TAG_compile_unit
+; CHECK: DW_AT_low_pc
+; CHECK: DW_AT_high_pc
+
 ; CHECK: .debug_loc contents:
 ; CHECK: 0x00000000: Beginning address offset: 0x0000000000000000
 ; CHECK:                Ending address offset: 0x0000000000000017
diff --git a/test/DebugInfo/X86/debug-macro.ll b/test/DebugInfo/X86/debug-macro.ll
index 2b3adce4776e..a8b3d4b9b87e 100644
--- a/test/DebugInfo/X86/debug-macro.ll
+++ b/test/DebugInfo/X86/debug-macro.ll
@@ -1,40 +1,38 @@
-; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck --check-prefix=CHECK-INFO %s
-; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-dump=macro - | FileCheck --check-prefix=CHECK-MACRO %s
-; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-dump=line - | FileCheck --check-prefix=CHECK-LINE %s
+; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump - | FileCheck %s
 
 
-; CHECK-INFO: .debug_info contents:
-; CHECK-INFO: DW_TAG_compile_unit
-; CHECK-INFO-NOT: DW_TAG
-; CHECK-INFO:   DW_AT_name {{.*}}"debug-macro.cpp")
-; CHECK-INFO:   DW_AT_macro_info {{.*}}(0x00000000)
-; CHECK-INFO: DW_TAG_compile_unit
-; CHECK-INFO-NOT: DW_TAG
-; CHECK-INFO:   DW_AT_name {{.*}}"debug-macro1.cpp")
-; CHECK-INFO:   DW_AT_macro_info {{.*}}(0x00000044)
-; CHECK-INFO: DW_TAG_compile_unit
-; CHECK-INFO-NOT: DW_TAG
-; CHECK-INFO:   DW_AT_name {{.*}}"debug-macro2.cpp")
-; CHECK-INFO-NOT: DW_AT_macro_info
+; CHECK-LABEL: .debug_info contents:
+; CHECK: DW_TAG_compile_unit
+; CHECK-NOT: DW_TAG
+; CHECK:   DW_AT_name {{.*}}"debug-macro.cpp")
+; CHECK:   DW_AT_macro_info {{.*}}(0x00000000)
+; CHECK: DW_TAG_compile_unit
+; CHECK-NOT: DW_TAG
+; CHECK:   DW_AT_name {{.*}}"debug-macro1.cpp")
+; CHECK:   DW_AT_macro_info {{.*}}(0x00000044)
+; CHECK: DW_TAG_compile_unit
+; CHECK-NOT: DW_TAG
+; CHECK:   DW_AT_name {{.*}}"debug-macro2.cpp")
+; CHECK-NOT: DW_AT_macro_info
 
-; CHECK-MACRO:     .debug_macinfo contents:
-; CHECK-MACRO-NEXT: DW_MACINFO_define - lineno: 0 macro: NameCMD ValueCMD
-; CHECK-MACRO-NEXT: DW_MACINFO_start_file - lineno: 0 filenum: 1
-; CHECK-MACRO-NEXT:   DW_MACINFO_start_file - lineno: 9 filenum: 2
-; CHECK-MACRO-NEXT:     DW_MACINFO_define - lineno: 1 macro: NameDef Value
-; CHECK-MACRO-NEXT:     DW_MACINFO_undef - lineno: 11 macro: NameUndef
-; CHECK-MACRO-NEXT:   DW_MACINFO_end_file
-; CHECK-MACRO-NEXT:   DW_MACINFO_undef - lineno: 10 macro: NameUndef2
-; CHECK-MACRO-NEXT: DW_MACINFO_end_file
-; CHECK-MACRO-NEXT: DW_MACINFO_start_file - lineno: 0 filenum: 1
-; CHECK-MACRO-NEXT: DW_MACINFO_end_file
+; CHECK-LABEL:     .debug_macinfo contents:
+; CHECK-NEXT: DW_MACINFO_define - lineno: 0 macro: NameCMD ValueCMD
+; CHECK-NEXT: DW_MACINFO_start_file - lineno: 0 filenum: 1
+; CHECK-NEXT:   DW_MACINFO_start_file - lineno: 9 filenum: 2
+; CHECK-NEXT:     DW_MACINFO_define - lineno: 1 macro: NameDef Value
+; CHECK-NEXT:     DW_MACINFO_undef - lineno: 11 macro: NameUndef
+; CHECK-NEXT:   DW_MACINFO_end_file
+; CHECK-NEXT:   DW_MACINFO_undef - lineno: 10 macro: NameUndef2
+; CHECK-NEXT: DW_MACINFO_end_file
+; CHECK-NEXT: DW_MACINFO_start_file - lineno: 0 filenum: 1
+; CHECK-NEXT: DW_MACINFO_end_file
 
-; CHECK-LINE: .debug_line contents:
-; CHECK-LINE: Dir  Mod Time   File Len   File Name
-; CHECK-LINE: file_names[  1] {{.*}}debug-macro.cpp
-; CHECK-LINE: file_names[  2] {{.*}}debug-macro.h
-; CHECK-LINE: Dir  Mod Time   File Len   File Name
-; CHECK-LINE: file_names[  1] {{.*}}debug-macro1.cpp
+; CHECK-LABEL: .debug_line contents:
+; CHECK: Dir  Mod Time   File Len   File Name
+; CHECK: file_names[  1] {{.*}}debug-macro.cpp
+; CHECK: file_names[  2] {{.*}}debug-macro.h
+; CHECK: Dir  Mod Time   File Len   File Name
+; CHECK: file_names[  1] {{.*}}debug-macro1.cpp
 
 !llvm.dbg.cu = !{!0, !16, !20}
 !llvm.module.flags = !{!13, !14}
@@ -58,10 +56,14 @@
 !14 = !{i32 1, !"Debug Info Version", i32 3}
 !15 = !{!"clang version 3.5.0 "}
 
-!16 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !17, enums: !2, retainedTypes: !2, globals: !2, imports: !2, macros: !18)
+!16 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !17, enums: !2, retainedTypes: !22, globals: !2, imports: !2, macros: !18)
 !17 = !DIFile(filename: "debug-macro1.cpp", directory: "/")
 !18 = !{!19}
 !19 = !DIMacroFile(line: 0, file: !17, nodes: !2)
 
-!20 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !21, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!20 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !21, enums: !2, retainedTypes: !24, globals: !2, imports: !2)
 !21 = !DIFile(filename: "debug-macro2.cpp", directory: "/")
+!22 = !{!23}
+!23 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!24 = !{!25}
+!25 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
diff --git a/test/DebugInfo/X86/empty.ll b/test/DebugInfo/X86/empty.ll
index 774b908adb30..b89af579fd90 100644
--- a/test/DebugInfo/X86/empty.ll
+++ b/test/DebugInfo/X86/empty.ll
@@ -8,10 +8,9 @@
 ; CHECK-NOT: file_names[
 
 ; CHECK: .debug_pubnames contents:
-; CHECK-NOT: Offset
-
-; CHECK: .debug_pubtypes contents:
-; CHECK-NOT: Offset
+; CHECK-NEXT: length = 0x0000000e
+; CHECK-NEXT: Offset
+; CHECK-NEXT: {{^$}}
 
 ; Don't emit DW_AT_addr_base when there are no addresses.
 ; FISSION-NOT: DW_AT_GNU_addr_base [DW_FORM_sec_offset]
@@ -19,8 +18,10 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!5}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1 (trunk 143523)", isOptimized: true, emissionKind: FullDebug, file: !4, enums: !2, retainedTypes: !2, globals: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1 (trunk 143523)", isOptimized: true, emissionKind: FullDebug, file: !4, enums: !2, retainedTypes: !6, globals: !2)
 !2 = !{}
 !3 = !DIFile(filename: "empty.c", directory: "/home/nlewycky")
 !4 = !DIFile(filename: "empty.c", directory: "/home/nlewycky")
 !5 = !{i32 1, !"Debug Info Version", i32 3}
+!6 = !{!7}
+!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/test/DebugInfo/X86/fission-hash.ll b/test/DebugInfo/X86/fission-hash.ll
index 1a5fba293175..de9966ab0be0 100644
--- a/test/DebugInfo/X86/fission-hash.ll
+++ b/test/DebugInfo/X86/fission-hash.ll
@@ -1,16 +1,18 @@
 ; RUN: llc -split-dwarf-file=foo.dwo -O0 %s -mtriple=x86_64-unknown-linux-gnu -filetype=obj -o %t
 ; RUN: llvm-dwarfdump -debug-dump=all %t | FileCheck %s
 
-; The source is an empty file.
+; The source is an empty file, modified to include/retain an 'int' type, since empty CUs are omitted.
 
-; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x0c1e629c9e5ada4f)
-; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x0c1e629c9e5ada4f)
+; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x50d985146a74bb00)
+; CHECK: DW_AT_GNU_dwo_id [DW_FORM_data8] (0x50d985146a74bb00)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (trunk 188230) (llvm/trunk 188234)", isOptimized: false, splitDebugFilename: "foo.dwo", emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (trunk 188230) (llvm/trunk 188234)", isOptimized: false, splitDebugFilename: "foo.dwo", emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !5, globals: !2, imports: !2)
 !1 = !DIFile(filename: "foo.c", directory: "/usr/local/google/home/echristo/tmp")
 !2 = !{}
 !3 = !{i32 2, !"Dwarf Version", i32 3}
 !4 = !{i32 1, !"Debug Info Version", i32 3}
+!5 = !{!6}
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/test/DebugInfo/X86/gnu-public-names-empty.ll b/test/DebugInfo/X86/gnu-public-names-empty.ll
index b04f7af64908..c5d44ad0f082 100644
--- a/test/DebugInfo/X86/gnu-public-names-empty.ll
+++ b/test/DebugInfo/X86/gnu-public-names-empty.ll
@@ -9,11 +9,18 @@
 ; CHECK: DW_AT_GNU_pubnames [DW_FORM_flag_present]   (true)
 ; CHECK-NOT: DW_AT_GNU_pubtypes [
 
+; CHECK: .debug_gnu_pubnames contents:
+; CHECK-NEXT: length = 0x0000000e
+; CHECK-NEXT: Offset
+; CHECK-NEXT: {{^$}}
+
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (trunk 191846) (llvm/trunk 191866)", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (trunk 191846) (llvm/trunk 191866)", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !5, globals: !2, imports: !2)
 !1 = !DIFile(filename: "foo.c", directory: "/usr/local/google/home/echristo/tmp")
 !2 = !{}
 !3 = !{i32 2, !"Dwarf Version", i32 4}
 !4 = !{i32 1, !"Debug Info Version", i32 3}
+!5 = !{!6}
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/test/DebugInfo/X86/gnu-public-names-gmlt.ll b/test/DebugInfo/X86/gnu-public-names-gmlt.ll
new file mode 100644
index 000000000000..569f56a50af0
--- /dev/null
+++ b/test/DebugInfo/X86/gnu-public-names-gmlt.ll
@@ -0,0 +1,68 @@
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -filetype=obj < %s -generate-gnu-dwarf-pub-sections | llvm-dwarfdump - | FileCheck --check-prefix=GPUB --check-prefix=CHECK %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -filetype=obj < %s -generate-dwarf-pub-sections=Enable | llvm-dwarfdump - | FileCheck --check-prefix=PUB --check-prefix=CHECK %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -filetype=obj < %s | llvm-dwarfdump - | FileCheck --check-prefix=NONE %s
+
+; Generated from:
+;   void f1();
+;   inline __attribute__((always_inline)) void f2() {
+;     f1();
+;   }
+;   void f3() {
+;     f2();
+;   }
+;   $ clang++ -gmlt %s -emit-llvm -S
+
+; GPUB: Compile Unit
+; GPUB: DW_AT_GNU_pubnames
+
+; GPUB: .debug_gnu_pubnames contents:
+; PUB: .debug_pubnames contents:
+; CHECK-NEXT: unit_offset = 0x00000000
+; CHECK-NEXT: Name
+; CHECK-NEXT: "f2"
+; CHECK-NEXT: "f3"
+
+; GPUB: .debug_gnu_pubtypes contents:
+; PUB: .debug_pubtypes contents:
+; CHECK-NEXT: length = 0x0000000e version = 0x0002 unit_offset = 0x00000000
+; CHECK-NEXT: Name
+
+; NONE: .debug_pubnames contents:
+; NONE: {{^$}}
+; NONE: .debug_pubtypes contents:
+; NONE: {{^$}}
+; NONE: .debug_gnu_pubnames contents:
+; NONE: {{^$}}
+; NONE: .debug_gnu_pubtypes contents:
+; NONE: {{^$}}
+
+
+; Function Attrs: noinline uwtable
+define void @_Z2f3v() #0 !dbg !7 {
+entry:
+  call void @_Z2f1v(), !dbg !9
+  ret void, !dbg !12
+}
+
+declare void @_Z2f1v() #1
+
+attributes #0 = { noinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 303768) (llvm/trunk 303774)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "gnu-public-names-gmlt.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 5.0.0 (trunk 303768) (llvm/trunk 303774)"}
+!7 = distinct !DISubprogram(name: "f3", scope: !1, file: !1, line: 5, type: !8, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 3, column: 3, scope: !10, inlinedAt: !11)
+!10 = distinct !DISubprogram(name: "f2", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!11 = distinct !DILocation(line: 6, column: 3, scope: !7)
+!12 = !DILocation(line: 7, column: 1, scope: !7)
diff --git a/test/DebugInfo/X86/split-dwarf-multiple-cu-hash.ll b/test/DebugInfo/X86/split-dwarf-multiple-cu-hash.ll
new file mode 100644
index 000000000000..f801ade27bd9
--- /dev/null
+++ b/test/DebugInfo/X86/split-dwarf-multiple-cu-hash.ll
@@ -0,0 +1,42 @@
+; RUN: %llc_dwarf -split-dwarf-file=foo.dwo  %s -filetype=obj -o %T/a.o
+; RUN: %llc_dwarf -split-dwarf-file=bar.dwo  %s -filetype=obj -o %T/b.o
+; RUN: llvm-dwarfdump -debug-dump=info %T/a.o %T/b.o | FileCheck %s
+
+; CHECK: dwo_id {{.*}}([[HASH:.*]])
+; CHECK-NOT: dwo_id {{.*}}([[HASH]])
+
+target triple = "x86_64-pc-linux"
+
+; Function Attrs: noinline nounwind uwtable
+define void @_Z1av() #0 !dbg !9 {
+entry:
+  ret void, !dbg !12
+}
+
+; Function Attrs: noinline nounwind uwtable
+define void @_Z1bv() #0 !dbg !13 {
+entry:
+  ret void, !dbg !14
+}
+
+attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0, !3}
+!llvm.ident = !{!5, !5}
+!llvm.module.flags = !{!6, !7, !8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 304107) (llvm/trunk 304109)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "a.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
+!2 = !{}
+!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !4, producer: "clang version 5.0.0 (trunk 304107) (llvm/trunk 304109)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!4 = !DIFile(filename: "b.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
+!5 = !{!"clang version 5.0.0 (trunk 304107) (llvm/trunk 304109)"}
+!6 = !{i32 2, !"Dwarf Version", i32 4}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{i32 1, !"wchar_size", i32 4}
+!9 = distinct !DISubprogram(name: "a", linkageName: "_Z1av", scope: !1, file: !1, line: 1, type: !10, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!10 = !DISubroutineType(types: !11)
+!11 = !{null}
+!12 = !DILocation(line: 2, column: 1, scope: !9)
+!13 = distinct !DISubprogram(name: "b", linkageName: "_Z1bv", scope: !4, file: !4, line: 1, type: !10, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !3, variables: !2)
+!14 = !DILocation(line: 2, column: 1, scope: !13)
diff --git a/test/DebugInfo/X86/split-dwarf-omit-empty.ll b/test/DebugInfo/X86/split-dwarf-omit-empty.ll
new file mode 100644
index 000000000000..5f824bd43cd3
--- /dev/null
+++ b/test/DebugInfo/X86/split-dwarf-omit-empty.ll
@@ -0,0 +1,54 @@
+; RUN: %llc_dwarf -split-dwarf-file=foo.dwo  %s -filetype=obj -o - | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; Created from:
+; a.cpp:
+;   void f1();
+;   inline __attribute__((always_inline)) __attribute__((used)) void f2() { f1(); }
+; b.cpp:
+;   void f2();
+;   void f3() {
+;     f2();
+;   }
+; $ clang++ -fno-split-dwarf-inlining {a,b}.cpp -emit-llvm -S -g
+; $ llvm-link {a,b}.ll -S -o ab.ll
+; Then strip out the @llvm.used global, so no out of line definition of 'f2'
+; will be emitted. This emulates something more like the available_externally
+; import performed by ThinLTO.
+
+; CHECK: Compile Unit
+; CHECK-NOT: Compile Unit
+
+target triple = "x86_64-pc-linux"
+
+declare void @_Z2f1v()
+
+; Function Attrs: noinline norecurse uwtable
+define i32 @main() !dbg !9 {
+entry:
+  call void @_Z2f1v(), !dbg !13
+  ret i32 0, !dbg !18
+}
+
+!llvm.dbg.cu = !{!0, !3}
+!llvm.ident = !{!5, !5}
+!llvm.module.flags = !{!6, !7, !8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 304054) (llvm/trunk 304080)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false)
+!1 = !DIFile(filename: "a.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
+!2 = !{}
+!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !4, producer: "clang version 5.0.0 (trunk 304054) (llvm/trunk 304080)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false)
+!4 = !DIFile(filename: "b.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch")
+!5 = !{!"clang version 5.0.0 (trunk 304054) (llvm/trunk 304080)"}
+!6 = !{i32 2, !"Dwarf Version", i32 4}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{i32 1, !"wchar_size", i32 4}
+!9 = distinct !DISubprogram(name: "main", scope: !4, file: !4, line: 2, type: !10, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !3, variables: !2)
+!10 = !DISubroutineType(types: !11)
+!11 = !{!12}
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !DILocation(line: 2, column: 73, scope: !14, inlinedAt: !17)
+!14 = distinct !DISubprogram(name: "f2", linkageName: "_Z2f2v", scope: !1, file: !1, line: 2, type: !15, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!15 = !DISubroutineType(types: !16)
+!16 = !{null}
+!17 = distinct !DILocation(line: 3, column: 3, scope: !9)
+!18 = !DILocation(line: 4, column: 1, scope: !9)
diff --git a/test/DebugInfo/dwo.ll b/test/DebugInfo/dwo.ll
index 5eeca541e78d..b6de943c0239 100644
--- a/test/DebugInfo/dwo.ll
+++ b/test/DebugInfo/dwo.ll
@@ -8,8 +8,10 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2, dwoId: 43981)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, enums: !2, retainedTypes: !5, globals: !2, imports: !2, dwoId: 43981)
 !1 = !DIFile(filename: "<stdin>", directory: "/")
 !2 = !{}
 !3 = !{i32 2, !"Dwarf Version", i32 4}
 !4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!6}
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/test/DebugInfo/llvm-symbolizer.test b/test/DebugInfo/llvm-symbolizer.test
index f0db8f4b921f..2c64804659fc 100644
--- a/test/DebugInfo/llvm-symbolizer.test
+++ b/test/DebugInfo/llvm-symbolizer.test
@@ -27,6 +27,9 @@ RUN: cp %p/Inputs/split-dwarf-multiple-cu.dwo %T
 RUN: echo "%p/Inputs/split-dwarf-multiple-cu.o 0x4" >> %t.input
 RUN: cp %p/Inputs/split-dwarf-addr-object-relocation.dwo %T
 RUN: echo "%p/Inputs/split-dwarf-addr-object-relocation.o 0x14" >> %t.input
+RUN: cp %p/Inputs/split-dwarf-dwp.o %T
+RUN: cp %p/Inputs/split-dwarf-dwp.o.dwp %T
+RUN: echo "%T/split-dwarf-dwp.o 0x4" >> %t.input
 
 RUN: llvm-symbolizer --functions=linkage --inlining --demangle=false \
 RUN:    --default-arch=i386 < %t.input | FileCheck --check-prefix=CHECK --check-prefix=SPLIT --check-prefix=DWO %s
@@ -147,6 +150,11 @@ CHECK-NEXT: split-dwarf-addr-object-relocation.cpp:3:3
 CHECK-NEXT: f3
 CHECK-NEXT: split-dwarf-addr-object-relocation.cpp:6:0
 
+CHECK:      f2
+CHECK-NEXT: split-dwarf-dwp.cpp:3:3
+CHECK-NEXT: f3
+CHECK-NEXT: split-dwarf-dwp.cpp:6:0
+
 RUN: echo "unexisting-file 0x1234" > %t.input2
 RUN: llvm-symbolizer < %t.input2 2>&1 | FileCheck %s --check-prefix=MISSING-FILE
 
diff --git a/test/DebugInfo/omit-empty.ll b/test/DebugInfo/omit-empty.ll
new file mode 100644
index 000000000000..92450050d208
--- /dev/null
+++ b/test/DebugInfo/omit-empty.ll
@@ -0,0 +1,12 @@
+; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-objdump -h - | FileCheck %s
+
+; CHECK-NOT: .debug_
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/test/DebugInfo/skeletoncu.ll b/test/DebugInfo/skeletoncu.ll
index a90c8b355ce2..4c96d82036e7 100644
--- a/test/DebugInfo/skeletoncu.ll
+++ b/test/DebugInfo/skeletoncu.ll
@@ -8,9 +8,11 @@
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: false, runtimeVersion: 2, splitDebugFilename: "my.dwo", emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2, dwoId: 43981)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: false, runtimeVersion: 2, splitDebugFilename: "my.dwo", emissionKind: FullDebug, enums: !2, retainedTypes: !6, globals: !2, imports: !2, dwoId: 43981)
 !1 = !DIFile(filename: "<stdin>", directory: "/")
 !2 = !{}
 !3 = !{i32 2, !"Dwarf Version", i32 4}
 !4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!6 = !{!5}
 
diff --git a/test/ExecutionEngine/MCJIT/lit.local.cfg b/test/ExecutionEngine/MCJIT/lit.local.cfg
index f98140357736..e2535ef1dbfd 100644
--- a/test/ExecutionEngine/MCJIT/lit.local.cfg
+++ b/test/ExecutionEngine/MCJIT/lit.local.cfg
@@ -9,7 +9,8 @@ else:
 # FIXME: autoconf and cmake produce different arch names. We should normalize
 # them before getting here.
 if root.host_arch not in ['i386', 'x86', 'x86_64', 'AMD64',
-                          'AArch64', 'ARM', 'Mips', 'PowerPC', 'ppc64', 'SystemZ']:
+                          'AArch64', 'ARM', 'Mips',
+                          'PowerPC', 'ppc64', 'ppc64le', 'SystemZ']:
     config.unsupported = True
 
 if 'armv7' in root.host_arch:
diff --git a/test/ExecutionEngine/OrcMCJIT/lit.local.cfg b/test/ExecutionEngine/OrcMCJIT/lit.local.cfg
index f98140357736..e2535ef1dbfd 100644
--- a/test/ExecutionEngine/OrcMCJIT/lit.local.cfg
+++ b/test/ExecutionEngine/OrcMCJIT/lit.local.cfg
@@ -9,7 +9,8 @@ else:
 # FIXME: autoconf and cmake produce different arch names. We should normalize
 # them before getting here.
 if root.host_arch not in ['i386', 'x86', 'x86_64', 'AMD64',
-                          'AArch64', 'ARM', 'Mips', 'PowerPC', 'ppc64', 'SystemZ']:
+                          'AArch64', 'ARM', 'Mips',
+                          'PowerPC', 'ppc64', 'ppc64le', 'SystemZ']:
     config.unsupported = True
 
 if 'armv7' in root.host_arch:
diff --git a/test/ExecutionEngine/OrcMCJIT/pr32650.ll b/test/ExecutionEngine/OrcMCJIT/pr32650.ll
new file mode 100644
index 000000000000..bbf68aea512a
--- /dev/null
+++ b/test/ExecutionEngine/OrcMCJIT/pr32650.ll
@@ -0,0 +1,28 @@
+; RUN: %lli -jit-kind=orc-mcjit %s
+
+; This test is intended to verify that a function weakly defined in
+; JITted code, and strongly defined in the main executable, can be
+; correctly resolved when called from elsewhere in JITted code.
+
+; This test makes the assumption that the lli executable in compiled
+; to export symbols (e.g. --export-dynamic), and that is actually does
+; contain the symbol LLVMInitializeCodeGen.  (Note that this function
+; is not actually called by the test.  The test simply verifes that
+; the reference can be resolved without relocation errors.)
+
+define linkonce_odr void @LLVMInitializeCodeGen() {
+entry:
+  ret void
+}
+
+define void @test() {
+entry:
+  call void @LLVMInitializeCodeGen()
+  ret void
+}
+
+define i32 @main() {
+entry:
+  ret i32 0
+}
+
diff --git a/test/Feature/fp-intrinsics.ll b/test/Feature/fp-intrinsics.ll
index 960bfb5ca105..f21ba15b2d49 100644
--- a/test/Feature/fp-intrinsics.ll
+++ b/test/Feature/fp-intrinsics.ll
@@ -95,8 +95,156 @@ if.end:
 }
 
 
+; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f5
+; CHECK: call double @llvm.experimental.constrained.sqrt
+define double @f5() {
+entry:
+  %result = call double @llvm.experimental.constrained.sqrt.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that pow(42.1, 3.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f6
+; CHECK: call double @llvm.experimental.constrained.pow
+define double @f6() {
+entry:
+  %result = call double @llvm.experimental.constrained.pow.f64(double 42.1,
+                                               double 3.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that powi(42.1, 3) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f7
+; CHECK: call double @llvm.experimental.constrained.powi
+define double @f7() {
+entry:
+  %result = call double @llvm.experimental.constrained.powi.f64(double 42.1,
+                                               i32 3,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that sin(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f8
+; CHECK: call double @llvm.experimental.constrained.sin
+define double @f8() {
+entry:
+  %result = call double @llvm.experimental.constrained.sin.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that cos(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f9
+; CHECK: call double @llvm.experimental.constrained.cos
+define double @f9() {
+entry:
+  %result = call double @llvm.experimental.constrained.cos.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that exp(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f10
+; CHECK: call double @llvm.experimental.constrained.exp
+define double @f10() {
+entry:
+  %result = call double @llvm.experimental.constrained.exp.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that exp2(42.1) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f11
+; CHECK: call double @llvm.experimental.constrained.exp2
+define double @f11() {
+entry:
+  %result = call double @llvm.experimental.constrained.exp2.f64(double 42.1,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that log(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f12
+; CHECK: call double @llvm.experimental.constrained.log
+define double @f12() {
+entry:
+  %result = call double @llvm.experimental.constrained.log.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that log10(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f13
+; CHECK: call double @llvm.experimental.constrained.log10
+define double @f13() {
+entry:
+  %result = call double @llvm.experimental.constrained.log10.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that log2(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f14
+; CHECK: call double @llvm.experimental.constrained.log2
+define double @f14() {
+entry:
+  %result = call double @llvm.experimental.constrained.log2.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that rint(42.1) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: f15
+; CHECK: call double @llvm.experimental.constrained.rint
+define double @f15() {
+entry:
+  %result = call double @llvm.experimental.constrained.rint.f64(double 42.1,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
+; Verify that nearbyint(42.1) isn't simplified when the rounding mode is
+; unknown.
+; CHECK-LABEL: f16
+; CHECK: call double @llvm.experimental.constrained.nearbyint
+define double @f16() {
+entry:
+  %result = call double @llvm.experimental.constrained.nearbyint.f64(
+                                               double 42.1,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %result
+}
+
 @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata"
 declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata)
+declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata)
+declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata)
+declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.exp.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.exp2.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.log10.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.log2.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata)
diff --git a/test/Instrumentation/SanitizerCoverage/chains.ll b/test/Instrumentation/SanitizerCoverage/chains.ll
new file mode 100644
index 000000000000..86b109165ee5
--- /dev/null
+++ b/test/Instrumentation/SanitizerCoverage/chains.ll
@@ -0,0 +1,33 @@
+; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -sanitizer-coverage-prune-blocks=1  -S | FileCheck %s
+
+define i32 @blah(i32) #0 {
+  %2 = icmp sgt i32 %0, 1
+  br i1 %2, label %branch, label %exit
+; CHECK: call void @__sanitizer_cov_trace_pc()
+
+branch:
+  br label %pos2
+; CHECK-LABEL: branch:
+; CHECK-NOT: call void @__sanitizer_cov_trace_pc()
+
+pos2:
+  br label %pos3
+; CHECK-LABEL: pos2:
+; CHECK-NOT: call void @__sanitizer_cov_trace_pc()
+
+pos3:
+  br label %pos4
+; CHECK-LABEL: pos3:
+; CHECK-NOT: call void @__sanitizer_cov_trace_pc()
+
+pos4:
+  ret i32 0
+; CHECK-LABEL: pos4:
+; CHECK: call void @__sanitizer_cov_trace_pc()
+
+exit:
+  ret i32 0
+; CHECK-LABEL: exit:
+; CHECK: call void @__sanitizer_cov_trace_pc()
+
+}
diff --git a/test/Instrumentation/SanitizerCoverage/postdominator_check.ll b/test/Instrumentation/SanitizerCoverage/postdominator_check.ll
new file mode 100644
index 000000000000..c50d663eff82
--- /dev/null
+++ b/test/Instrumentation/SanitizerCoverage/postdominator_check.ll
@@ -0,0 +1,85 @@
+; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -sanitizer-coverage-prune-blocks=1 -S | FileCheck %s
+; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc -sanitizer-coverage-prune-blocks=0 -S | FileCheck %s --check-prefix=CHECK_NO_PRUNE
+
+define i32 @foo(i32) #0 {
+  %2 = icmp sgt i32 %0, 0
+  br i1 %2, label %left, label %right
+; CHECK: call void @__sanitizer_cov_trace_pc()
+
+; CHECK_NO_PRUNE: call void @__sanitizer_cov_trace_pc()
+
+left:
+  %3 = icmp sgt i32 %0, 10
+  br i1 %3, label %left_left, label %left_right
+; CHECK-LABEL: left:
+; CHECK-NOT: call void @__sanitizer_cov_trace_pc()
+
+; CHECK_NO_PRUNE-LABEL: left:
+; CHECK_NO_PRUNE: call void @__sanitizer_cov_trace_pc()
+
+left_left:
+  br label %left_join
+; CHECK-LABEL: left_left:
+; CHECK: call void @__sanitizer_cov_trace_pc()
+
+; CHECK_NO_PRUNE-LABEL: left_left:
+; CHECK_NO_PRUNE: call void @__sanitizer_cov_trace_pc()
+
+left_right:
+  br label %left_join
+; CHECK-LABEL: left_right:
+; CHECK: call void @__sanitizer_cov_trace_pc()
+
+; CHECK_NO_PRUNE-LABEL: left_right:
+; CHECK_NO_PRUNE: call void @__sanitizer_cov_trace_pc()
+
+left_join:
+  br label %finish
+; CHECK-LABEL: left_join:
+; CHECK-NOT: call void @__sanitizer_cov_trace_pc()
+
+; CHECK_NO_PRUNE-LABEL: left_join:
+; CHECK_NO_PRUNE: call void @__sanitizer_cov_trace_pc()
+
+right:
+  %4 = icmp sgt i32 %0, 10
+  br i1 %4, label %right_left, label %right_right
+; CHECK-LABEL: right:
+; CHECK-NOT: call void @__sanitizer_cov_trace_pc()
+
+; CHECK_NO_PRUNE-LABEL: right:
+; CHECK_NO_PRUNE: call void @__sanitizer_cov_trace_pc()
+
+right_left:
+  br label %right_join
+; CHECK-LABEL: right_left:
+; CHECK: call void @__sanitizer_cov_trace_pc()
+
+; CHECK_NO_PRUNE-LABEL: right_left:
+; CHECK_NO_PRUNE: call void @__sanitizer_cov_trace_pc()
+
+right_right:
+  br label %right_join
+; CHECK-LABEL: right_right:
+; CHECK: call void @__sanitizer_cov_trace_pc()
+
+; CHECK_NO_PRUNE-LABEL: right_right:
+; CHECK_NO_PRUNE: call void @__sanitizer_cov_trace_pc()
+
+right_join:
+  br label %finish
+; CHECK-LABEL: right_join:
+; CHECK-NOT: call void @__sanitizer_cov_trace_pc()
+
+; CHECK_NO_PRUNE-LABEL: right_join:
+; CHECK_NO_PRUNE: call void @__sanitizer_cov_trace_pc()
+
+finish:
+  ret i32 %0
+; CHECK-LABEL: finish:
+; CHECK-NOT: call void @__sanitizer_cov_trace_pc()
+
+; CHECK_NO_PRUNE-LABEL: finish:
+; CHECK_NO_PRUNE: call void @__sanitizer_cov_trace_pc()
+
+}
diff --git a/test/LTO/Resolution/X86/linkonce.ll b/test/LTO/Resolution/X86/linkonce.ll
new file mode 100644
index 000000000000..33d2df740a4d
--- /dev/null
+++ b/test/LTO/Resolution/X86/linkonce.ll
@@ -0,0 +1,11 @@
+; RUN: opt -module-summary -o %t %s
+; RUN: llvm-lto2 run %t -O0 -r %t,foo,px -o %t2
+; RUN: llvm-nm %t2.0 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: W foo
+define linkonce_odr void @foo() {
+  ret void
+}
diff --git a/test/LTO/Resolution/X86/type-checked-load.ll b/test/LTO/Resolution/X86/type-checked-load.ll
new file mode 100644
index 000000000000..3ec783bc5653
--- /dev/null
+++ b/test/LTO/Resolution/X86/type-checked-load.ll
@@ -0,0 +1,16 @@
+; RUN: llvm-as -o %t %s
+; RUN: llvm-lto2 run %t -O0 -r %t,foo,px -o %t2
+
+; This just tests that we don't crash when compiling this test case.
+; It means that the wholeprogramdevirt pass must have run and lowered
+; the llvm.type.checked.load call.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define {i8*, i1} @foo(i8* %ptr) {
+  %pair = call {i8*, i1} @llvm.type.checked.load(i8* %ptr, i32 16, metadata !"foo")
+  ret {i8*, i1} %pair
+}
+
+declare {i8*, i1} @llvm.type.checked.load(i8* %ptr, i32 %offset, metadata %type)
diff --git a/test/Linker/Inputs/module-flags-pic-2-b.ll b/test/Linker/Inputs/module-flags-pic-2-b.ll
index 0d78cafc6a0f..f652eddb3842 100644
--- a/test/Linker/Inputs/module-flags-pic-2-b.ll
+++ b/test/Linker/Inputs/module-flags-pic-2-b.ll
@@ -1,3 +1,4 @@
-!0 = !{ i32 1, !"PIC Level", i32 2 }
+!0 = !{ i32 7, !"PIC Level", i32 2 }
+!1 = !{ i32 7, !"PIE Level", i32 2 }
 
-!llvm.module.flags = !{!0}
+!llvm.module.flags = !{!0, !1}
diff --git a/test/Linker/module-flags-pic-2-a.ll b/test/Linker/module-flags-pic-2-a.ll
index e09af6bcd128..8898d72d5101 100644
--- a/test/Linker/module-flags-pic-2-a.ll
+++ b/test/Linker/module-flags-pic-2-a.ll
@@ -1,10 +1,11 @@
-; RUN: not llvm-link %s %p/Inputs/module-flags-pic-2-b.ll -S -o - 2> %t
-; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
+; RUN: llvm-link %s %p/Inputs/module-flags-pic-2-b.ll -S -o - | FileCheck %s
 
-; test linking modules with two different PIC levels
+; test linking modules with two different PIC and PIE levels
 
-!0 = !{ i32 1, !"PIC Level", i32 1 }
+!0 = !{ i32 7, !"PIC Level", i32 1 }
+!1 = !{ i32 7, !"PIE Level", i32 1 }
 
-!llvm.module.flags = !{!0}
+!llvm.module.flags = !{!0, !1}
 
-; CHECK-ERRORS: ERROR: linking module flags 'PIC Level': IDs have conflicting values
+; CHECK: !0 = !{i32 7, !"PIC Level", i32 2}
+; CHECK: !1 = !{i32 7, !"PIE Level", i32 2}
diff --git a/test/MC/AMDGPU/vop_sdwa.s b/test/MC/AMDGPU/vop_sdwa.s
index 75db3259f43c..59dd30ed0e48 100644
--- a/test/MC/AMDGPU/vop_sdwa.s
+++ b/test/MC/AMDGPU/vop_sdwa.s
@@ -1,42 +1,42 @@
-// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=VI --check-prefix=GFX89
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX9 --check-prefix=GFX89
 
 // RUN: not llvm-mc -arch=amdgcn -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI
 // RUN: not llvm-mc -arch=amdgcn -mcpu=SI -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSI --check-prefix=NOSICI
 // RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOSICI
-// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOVI
-
-// ToDo: intrinsics
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOVI --check-prefix=NOGFX89
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=NOGFX9 --check-prefix=NOGFX89
 
 //---------------------------------------------------------------------------//
 // Check SDWA operands
 //---------------------------------------------------------------------------//
 
 // NOSICI: error:
-// VI: v_mov_b32_sdwa v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x02,0x10,0x06,0x06]
+// GFX89: v_mov_b32_sdwa v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x02,0x10,0x06,0x06]
 v_mov_b32 v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 
 // NOSICI: error:
-// VI: v_mov_b32_sdwa v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; encoding: [0xf9,0x02,0x06,0x7e,0x04,0x11,0x05,0x06]
+// GFX89: v_mov_b32_sdwa v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; encoding: [0xf9,0x02,0x06,0x7e,0x04,0x11,0x05,0x06]
 v_mov_b32 v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_mov_b32_sdwa v15, v99 dst_sel:BYTE_2 dst_unused:UNUSED_SEXT src0_sel:WORD_0 ; encoding: [0xf9,0x02,0x1e,0x7e,0x63,0x0a,0x04,0x06]
+// GFX89: v_mov_b32_sdwa v15, v99 dst_sel:BYTE_2 dst_unused:UNUSED_SEXT src0_sel:WORD_0 ; encoding: [0xf9,0x02,0x1e,0x7e,0x63,0x0a,0x04,0x06]
 v_mov_b32 v15, v99 dst_sel:BYTE_2 dst_unused:UNUSED_SEXT src0_sel:WORD_0
 
 // NOSICI: error:
-// VI: v_min_u32_sdwa v194, v13, v1 dst_sel:BYTE_3 dst_unused:UNUSED_SEXT src0_sel:BYTE_3 src1_sel:BYTE_2 ; encoding: [0xf9,0x02,0x84,0x1d,0x0d,0x0b,0x03,0x02]
+// GFX89: v_min_u32_sdwa v194, v13, v1 dst_sel:BYTE_3 dst_unused:UNUSED_SEXT src0_sel:BYTE_3 src1_sel:BYTE_2 ; encoding: [0xf9,0x02,0x84,0x1d,0x0d,0x0b,0x03,0x02]
 v_min_u32 v194, v13, v1 dst_sel:BYTE_3 dst_unused:UNUSED_SEXT src0_sel:BYTE_3 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_min_u32_sdwa v255, v4, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:WORD_1 ; encoding: [0xf9,0x02,0xfe,0x1d,0x04,0x04,0x02,0x05]
+// GFX89: v_min_u32_sdwa v255, v4, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:WORD_1 ; encoding: [0xf9,0x02,0xfe,0x1d,0x04,0x04,0x02,0x05]
 v_min_u32 v255, v4, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_min_u32_sdwa v200, v200, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x90,0x1d,0xc8,0x05,0x01,0x06]
+// GFX89: v_min_u32_sdwa v200, v200, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x90,0x1d,0xc8,0x05,0x01,0x06]
 v_min_u32 v200, v200, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 
 // NOSICI: error:
-// VI: v_min_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x1c,0x01,0x06,0x00,0x06]
+// GFX89: v_min_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x1c,0x01,0x06,0x00,0x06]
 v_min_u32 v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 
 //---------------------------------------------------------------------------//
@@ -44,43 +44,43 @@ v_min_u32 v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_se
 //---------------------------------------------------------------------------//
 
 // NOSICI: error:
-// VI: v_cvt_u32_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x0e,0x00,0x7e,0x00,0x36,0x06,0x06]
+// GFX89: v_cvt_u32_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x0e,0x00,0x7e,0x00,0x36,0x06,0x06]
 v_cvt_u32_f32 v0, v0 clamp dst_sel:DWORD
 
 // NOSICI: error:
-// VI: v_fract_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD ; encoding: [0xf9,0x36,0x00,0x7e,0x00,0x26,0x06,0x06]
+// GFX89: v_fract_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD ; encoding: [0xf9,0x36,0x00,0x7e,0x00,0x26,0x06,0x06]
 v_fract_f32 v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD
 
 // NOSICI: error:
-// VI: v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x05,0x06]
 v_sin_f32 v0, v0 dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_mov_b32_sdwa v1, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; encoding: [0xf9,0x02,0x02,0x7e,0x00,0x36,0x05,0x06]
+// GFX89: v_mov_b32_sdwa v1, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; encoding: [0xf9,0x02,0x02,0x7e,0x00,0x36,0x05,0x06]
 v_mov_b32 v1, v0 clamp src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_trunc_f32_sdwa v1, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; encoding: [0xf9,0x38,0x02,0x7e,0x00,0x36,0x05,0x06]
+// GFX89: v_trunc_f32_sdwa v1, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; encoding: [0xf9,0x38,0x02,0x7e,0x00,0x36,0x05,0x06]
 v_trunc_f32 v1, v0 clamp dst_sel:DWORD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_mov_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x00,0x16,0x06,0x06]
+// GFX89: v_mov_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x00,0x16,0x06,0x06]
 v_mov_b32_sdwa v1, v0
 
 // NOSICI: error:
-// VI: v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x06,0x05,0x06]
+// GFX89: v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x06,0x05,0x06]
 v_add_f32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_min_f32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x14,0x00,0x36,0x06,0x02]
+// GFX89: v_min_f32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x14,0x00,0x36,0x06,0x02]
 v_min_f32 v0, v0, v0 clamp dst_sel:DWORD src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x02]
+// GFX89: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x02]
 v_and_b32 v0, v0, v0 dst_unused:UNUSED_PAD src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_mul_i32_i24_sdwa v1, v2, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x06,0x02,0x0c,0x02,0x36,0x06,0x06]
+// GFX89: v_mul_i32_i24_sdwa v1, v2, v3 clamp dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x06,0x02,0x0c,0x02,0x36,0x06,0x06]
 v_mul_i32_i24_sdwa v1, v2, v3 clamp
 
 //===----------------------------------------------------------------------===//
@@ -88,255 +88,256 @@ v_mul_i32_i24_sdwa v1, v2, v3 clamp
 //===----------------------------------------------------------------------===//
 
 // NOSICI: error:
-// VI: v_fract_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x25,0x06]
+// GFX89: v_fract_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x25,0x06]
 v_fract_f32 v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_sin_f32_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x35,0x06]
+// GFX89: v_sin_f32_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x35,0x06]
 v_sin_f32 v0, -abs(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_add_f32_sdwa v0, -|v0|, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x06,0x35,0x12]
+// GFX89: v_add_f32_sdwa v0, -|v0|, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x06,0x35,0x12]
 v_add_f32 v0, -|v0|, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_min_f32_sdwa v0, |v0|, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x14,0x00,0x06,0x25,0x12]
+// GFX89: v_min_f32_sdwa v0, |v0|, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x14,0x00,0x06,0x25,0x12]
 v_min_f32 v0, abs(v0), -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_mov_b32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x00,0x16,0x0e,0x06]
+// GFX89: v_mov_b32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x00,0x16,0x0e,0x06]
 v_mov_b32_sdwa v1, sext(v0)
 
 // NOSICI: error:
-// VI: v_and_b32_sdwa v0, sext(v0), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x26,0x00,0x06,0x0e,0x0a]
+// GFX89: v_and_b32_sdwa v0, sext(v0), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x26,0x00,0x06,0x0e,0x0a]
 v_and_b32 v0, sext(v0), sext(v0) dst_unused:UNUSED_PAD src1_sel:BYTE_2
 
 // NOSICI: error:
 // VI: v_cmp_class_f32 vcc, -v1, sext(v2) src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x12,0x0c]
-v_cmp_class_f32 vcc, -v1, sext(v2) src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmp_class_f32_sdwa vcc, -v1, sext(v2) src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x20,0x7c,0x01,0x00,0x12,0x0c]
+v_cmp_class_f32_sdwa vcc, -v1, sext(v2) src0_sel:BYTE_2 src1_sel:WORD_0
 
 //===----------------------------------------------------------------------===//
 // Check VOP1 opcodes
 //===----------------------------------------------------------------------===//
 
 // NOSICI: error:
-// VI: v_nop ; encoding: [0xf9,0x00,0x00,0x7e,0x00,0x16,0x06,0x06]
+// GFX89: v_nop ; encoding: [0xf9,0x00,0x00,0x7e,0x00,0x16,0x06,0x06]
 v_nop_sdwa
 
 // NOSICI: error:
-// VI: v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_u32_f32 v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_fract_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_fract_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x05,0x06]
 v_fract_f32 v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x05,0x06]
 v_sin_f32 v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_mov_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x02,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_mov_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x02,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_mov_b32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_f32_i32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x0a,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_f32_i32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x0a,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_f32_i32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x0c,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x0c,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_f32_u32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_i32_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x10,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_i32_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x10,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_i32_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_f16_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x14,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_f16_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x14,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_f16_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x16,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x16,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_f32_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_rpi_i32_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x18,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_rpi_i32_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x18,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_rpi_i32_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_flr_i32_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x1a,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_flr_i32_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x1a,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_flr_i32_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_off_f32_i4_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x1c,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_off_f32_i4_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x1c,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_off_f32_i4 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x22,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x22,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_f32_ubyte0 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_f32_ubyte1_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x24,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_f32_ubyte1_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x24,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_f32_ubyte1 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_f32_ubyte2_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x26,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_f32_ubyte2_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x26,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_f32_ubyte2 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_f32_ubyte3_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x28,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_f32_ubyte3_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x28,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_f32_ubyte3 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_trunc_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x38,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_trunc_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x38,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_trunc_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_ceil_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x3a,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_ceil_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x3a,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_ceil_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_rndne_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x3c,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_rndne_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x3c,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_rndne_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_floor_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x3e,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_floor_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x3e,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_floor_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_exp_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x40,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_exp_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x40,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_exp_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_log_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x42,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_log_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x42,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_log_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_rcp_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x44,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_rcp_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x44,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_rcp_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_rcp_iflag_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x46,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_rcp_iflag_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x46,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_rcp_iflag_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_rsq_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x48,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_rsq_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x48,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_rsq_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_sqrt_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x4e,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_sqrt_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x4e,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_sqrt_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cos_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x54,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cos_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x54,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cos_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_not_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x56,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_not_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x56,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_not_b32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_bfrev_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x58,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_bfrev_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x58,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_bfrev_b32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x5a,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x5a,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_ffbh_u32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_ffbl_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x5c,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_ffbl_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x5c,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_ffbl_b32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_ffbh_i32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x5e,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_ffbh_i32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x5e,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_ffbh_i32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_frexp_exp_i32_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x66,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_frexp_exp_i32_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x66,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_frexp_exp_i32_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_frexp_mant_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x68,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_frexp_mant_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x68,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_frexp_mant_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_log_legacy_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x98,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_log_legacy_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x98,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_log_legacy_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_exp_legacy_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x96,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_exp_legacy_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x96,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_exp_legacy_f32 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_f16_u16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x72,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_f16_u16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x72,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_f16_u16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_f16_i16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x74,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_f16_i16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x74,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_f16_i16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_u16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x76,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_u16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x76,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_u16_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cvt_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x78,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cvt_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x78,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cvt_i16_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x7a,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x7a,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_rcp_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_sqrt_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x7c,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_sqrt_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x7c,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_sqrt_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x7e,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x7e,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_rsq_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x80,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x80,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_log_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x82,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x82,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_exp_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x84,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x84,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_frexp_mant_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x86,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x86,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_frexp_exp_i16_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_floor_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x88,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_floor_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x88,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_floor_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_ceil_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x8a,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_ceil_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x8a,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_ceil_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_trunc_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x8c,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_trunc_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x8c,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_trunc_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x8e,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x8e,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_rndne_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_fract_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x90,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_fract_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x90,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_fract_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_sin_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x92,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_sin_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x92,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_sin_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 // NOSICI: error:
-// VI: v_cos_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x94,0x02,0x7e,0x00,0x06,0x05,0x06]
+// GFX89: v_cos_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x94,0x02,0x7e,0x00,0x06,0x05,0x06]
 v_cos_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 //===----------------------------------------------------------------------===//
@@ -344,195 +345,179 @@ v_cos_f16 v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 //===----------------------------------------------------------------------===//
 
 // NOSICI: error:
-// VI: v_mac_f32_sdwa v3, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x0a,0x06,0x2c,0x04,0x16,0x05,0x06]
-v_mac_f32 v3, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
-
-// NOSICI: error:
-// VI: v_mac_f32_sdwa v15, v99, v194 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:WORD_0 src1_sel:DWORD ; encoding: [0xf9,0x84,0x1f,0x2c,0x63,0x0e,0x04,0x06]
-v_mac_f32 v15, v99, v194 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:WORD_0
-
-// NOSICI: error:
-// NOVI: error: invalid operand for instruction
-v_mac_f32 v194, v13, v1 dst_sel:BYTE_0 dst_unused:UNUSED_SEXT src0_sel:BYTE_3 src1_sel:BYTE_2
-
-// NOSICI: error:
-// VI: v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x06,0x05,0x02]
+// GFX89: v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x06,0x05,0x02]
 v_add_f32 v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x14,0x00,0x06,0x05,0x02]
+// GFX89: v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x14,0x00,0x06,0x05,0x02]
 v_min_f32 v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x26,0x00,0x06,0x05,0x02]
+// GFX89: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x26,0x00,0x06,0x05,0x02]
 v_and_b32 v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_mul_i32_i24_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x0c,0x02,0x06,0x05,0x02]
+// GFX89: v_mul_i32_i24_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x0c,0x02,0x06,0x05,0x02]
 v_mul_i32_i24 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_sub_f32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x04,0x02,0x06,0x05,0x02]
+// GFX89: v_sub_f32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x04,0x02,0x06,0x05,0x02]
 v_sub_f32 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_subrev_f32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x06,0x02,0x06,0x05,0x02]
+// GFX89: v_subrev_f32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x06,0x02,0x06,0x05,0x02]
 v_subrev_f32 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_mul_f32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x0a,0x02,0x06,0x05,0x02]
+// GFX89: v_mul_f32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x0a,0x02,0x06,0x05,0x02]
 v_mul_f32 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_mul_hi_i32_i24_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x0e,0x02,0x06,0x05,0x02]
+// GFX89: v_mul_hi_i32_i24_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x0e,0x02,0x06,0x05,0x02]
 v_mul_hi_i32_i24 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_mul_u32_u24_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x10,0x02,0x06,0x05,0x02]
+// GFX89: v_mul_u32_u24_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x10,0x02,0x06,0x05,0x02]
 v_mul_u32_u24 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_mul_hi_u32_u24_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x12,0x02,0x06,0x05,0x02]
+// GFX89: v_mul_hi_u32_u24_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x12,0x02,0x06,0x05,0x02]
 v_mul_hi_u32_u24 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_max_f32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x16,0x02,0x06,0x05,0x02]
+// GFX89: v_max_f32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x16,0x02,0x06,0x05,0x02]
 v_max_f32 v1, v2 v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_min_i32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x18,0x02,0x06,0x05,0x02]
+// GFX89: v_min_i32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x18,0x02,0x06,0x05,0x02]
 v_min_i32 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_max_i32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x1a,0x02,0x06,0x05,0x02]
+// GFX89: v_max_i32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x1a,0x02,0x06,0x05,0x02]
 v_max_i32 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_min_u32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x1c,0x02,0x06,0x05,0x02]
+// GFX89: v_min_u32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x1c,0x02,0x06,0x05,0x02]
 v_min_u32 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_max_u32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x1e,0x02,0x06,0x05,0x02]
+// GFX89: v_max_u32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x1e,0x02,0x06,0x05,0x02]
 v_max_u32 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_lshrrev_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x20,0x02,0x06,0x05,0x02]
+// GFX89: v_lshrrev_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x20,0x02,0x06,0x05,0x02]
 v_lshrrev_b32 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_ashrrev_i32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x22,0x02,0x06,0x05,0x02]
+// GFX89: v_ashrrev_i32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x22,0x02,0x06,0x05,0x02]
 v_ashrrev_i32 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_lshlrev_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x24,0x02,0x06,0x05,0x02]
+// GFX89: v_lshlrev_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x24,0x02,0x06,0x05,0x02]
 v_lshlrev_b32 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x28,0x02,0x06,0x05,0x02]
+// GFX89: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x28,0x02,0x06,0x05,0x02]
 v_or_b32 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_xor_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x2a,0x02,0x06,0x05,0x02]
+// GFX89: v_xor_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x2a,0x02,0x06,0x05,0x02]
 v_xor_b32 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_add_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x3e,0x02,0x06,0x05,0x02]
+// GFX89: v_add_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x3e,0x02,0x06,0x05,0x02]
 v_add_f16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_sub_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x40,0x02,0x06,0x05,0x02]
+// GFX89: v_sub_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x40,0x02,0x06,0x05,0x02]
 v_sub_f16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_subrev_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x42,0x02,0x06,0x05,0x02]
+// GFX89: v_subrev_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x42,0x02,0x06,0x05,0x02]
 v_subrev_f16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_mul_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x44,0x02,0x06,0x05,0x02]
+// GFX89: v_mul_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x44,0x02,0x06,0x05,0x02]
 v_mul_f16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_mac_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x46,0x02,0x06,0x05,0x02]
-v_mac_f16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
-
-// NOSICI: error:
-// VI: v_add_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x4c,0x02,0x06,0x05,0x02]
+// GFX89: v_add_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x4c,0x02,0x06,0x05,0x02]
 v_add_u16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_sub_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x4e,0x02,0x06,0x05,0x02]
+// GFX89: v_sub_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x4e,0x02,0x06,0x05,0x02]
 v_sub_u16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_subrev_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x50,0x02,0x06,0x05,0x02]
+// GFX89: v_subrev_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x50,0x02,0x06,0x05,0x02]
 v_subrev_u16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_mul_lo_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x52,0x02,0x06,0x05,0x02]
+// GFX89: v_mul_lo_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x52,0x02,0x06,0x05,0x02]
 v_mul_lo_u16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_lshlrev_b16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x54,0x02,0x06,0x05,0x02]
+// GFX89: v_lshlrev_b16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x54,0x02,0x06,0x05,0x02]
 v_lshlrev_b16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_lshrrev_b16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x56,0x02,0x06,0x05,0x02]
+// GFX89: v_lshrrev_b16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x56,0x02,0x06,0x05,0x02]
 v_lshrrev_b16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_ashrrev_i16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x58,0x02,0x06,0x05,0x02]
+// GFX89: v_ashrrev_i16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x58,0x02,0x06,0x05,0x02]
 v_ashrrev_i16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_max_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x5a,0x02,0x06,0x05,0x02]
+// GFX89: v_max_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x5a,0x02,0x06,0x05,0x02]
 v_max_f16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_min_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x5c,0x02,0x06,0x05,0x02]
+// GFX89: v_min_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x5c,0x02,0x06,0x05,0x02]
 v_min_f16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_max_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x5e,0x02,0x06,0x05,0x02]
+// GFX89: v_max_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x5e,0x02,0x06,0x05,0x02]
 v_max_u16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_max_i16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x60,0x02,0x06,0x05,0x02]
+// GFX89: v_max_i16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x60,0x02,0x06,0x05,0x02]
 v_max_i16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_min_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x62,0x02,0x06,0x05,0x02]
+// GFX89: v_min_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x62,0x02,0x06,0x05,0x02]
 v_min_u16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_min_i16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x64,0x02,0x06,0x05,0x02]
+// GFX89: v_min_i16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x64,0x02,0x06,0x05,0x02]
 v_min_i16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_ldexp_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x66,0x02,0x06,0x05,0x02]
+// GFX89: v_ldexp_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x66,0x02,0x06,0x05,0x02]
 v_ldexp_f16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_add_i32_sdwa v1, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x32,0x02,0x06,0x05,0x02]
+// GFX89: v_add_i32_sdwa v1, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x32,0x02,0x06,0x05,0x02]
 v_add_i32_sdwa v1, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_sub_i32_sdwa v1, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x34,0x02,0x06,0x05,0x02]
+// GFX89: v_sub_i32_sdwa v1, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x34,0x02,0x06,0x05,0x02]
 v_sub_i32_sdwa v1, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_subrev_i32_sdwa v1, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x36,0x02,0x06,0x05,0x02]
+// GFX89: v_subrev_i32_sdwa v1, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x36,0x02,0x06,0x05,0x02]
 v_subrev_i32_sdwa v1, vcc, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_addc_u32_sdwa v1, vcc, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x38,0x02,0x06,0x05,0x02]
+// GFX89: v_addc_u32_sdwa v1, vcc, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x38,0x02,0x06,0x05,0x02]
 v_addc_u32_sdwa v1, vcc, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_subb_u32_sdwa v1, vcc, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x3a,0x02,0x06,0x05,0x02]
+// GFX89: v_subb_u32_sdwa v1, vcc, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x3a,0x02,0x06,0x05,0x02]
 v_subb_u32_sdwa v1, vcc, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 // NOSICI: error:
-// VI: v_subbrev_u32_sdwa v1, vcc, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x3c,0x02,0x06,0x05,0x02]
+// GFX89: v_subbrev_u32_sdwa v1, vcc, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x3c,0x02,0x06,0x05,0x02]
 v_subbrev_u32_sdwa v1, vcc, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
 
 //===----------------------------------------------------------------------===//
@@ -541,92 +526,210 @@ v_subbrev_u32_sdwa v1, vcc, v2, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0
 
 // NOSICI: error:
 // VI: v_cmp_eq_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x02,0x04]
-v_cmp_eq_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x84,0x7c,0x01,0x00,0x02,0x04]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
 // VI: v_cmp_nle_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x02,0x04]
-v_cmp_nle_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x98,0x7c,0x01,0x00,0x02,0x04]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
 // VI: v_cmpx_gt_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x02,0x04]
-v_cmpx_gt_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xa8,0x7c,0x01,0x00,0x02,0x04]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
 // VI: v_cmpx_nlt_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x02,0x04]
-v_cmpx_nlt_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xbc,0x7c,0x01,0x00,0x02,0x04]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
 // VI: v_cmp_lt_i32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x02,0x04]
-v_cmp_lt_i32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x82,0x7d,0x01,0x00,0x02,0x04]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
 // VI: v_cmp_t_i32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x02,0x04]
-v_cmp_t_i32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x8e,0x7d,0x01,0x00,0x02,0x04]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
 // VI: v_cmpx_eq_i32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x02,0x04]
-v_cmpx_eq_i32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xa4,0x7d,0x01,0x00,0x02,0x04]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
 // VI: v_cmpx_ne_i32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x02,0x04]
-v_cmpx_ne_i32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xaa,0x7d,0x01,0x00,0x02,0x04]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
 // VI: v_cmp_f_u32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x02,0x04]
-v_cmp_f_u32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x90,0x7d,0x01,0x00,0x02,0x04]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
 // VI: v_cmp_gt_u32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x02,0x04]
-v_cmp_gt_u32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x98,0x7d,0x01,0x00,0x02,0x04]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
 // VI: v_cmpx_le_u32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x02,0x04]
-v_cmpx_le_u32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xb6,0x7d,0x01,0x00,0x02,0x04]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
 // VI: v_cmpx_ne_u32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x02,0x04]
-v_cmpx_ne_u32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xba,0x7d,0x01,0x00,0x02,0x04]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
 // VI: v_cmp_class_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x02,0x04]
-v_cmp_class_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x20,0x7c,0x01,0x00,0x02,0x04]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
 // VI: v_cmpx_class_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x02,0x04]
-v_cmpx_class_f32 vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
+// GFX9: v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x22,0x7c,0x01,0x00,0x02,0x04]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0
 
 //===----------------------------------------------------------------------===//
-// Check that immideates and scalar regs are not supported
+// Check that immideates are not supported
 //===----------------------------------------------------------------------===//
 
 // NOSICI: error:
-// NOVI: error: invalid operand for instruction
+// NOV9: error: invalid operand for instruction
 v_mov_b32 v0, 1 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
-// NOVI: error: invalid operand for instruction
+// NOGFX89: error: invalid operand for instruction
 v_and_b32 v0, 42, v1 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
-// NOVI: error: invalid operand for instruction
+// NOGFX89: error: invalid operand for instruction
 v_add_f32 v0, v1, 345 src0_sel:BYTE_2 src1_sel:WORD_0
 
 // NOSICI: error:
-// NOVI: error: invalid operand for instruction
+// NOGFX89: error: invalid operand for instruction
 v_cmpx_class_f32 vcc, -1, 200 src0_sel:BYTE_2 src1_sel:WORD_0
 
+//===----------------------------------------------------------------------===//
+// Check GFX9-specific SDWA features
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// v_mac_f16/f32 is prohibited
+//===----------------------------------------------------------------------===//
+
 // NOSICI: error:
-// NOVI: error: invalid operand for instruction
-v_mov_b32 v0, s1 src0_sel:BYTE_2 src1_sel:WORD_0
+// VI: v_mac_f32_sdwa v3, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x0a,0x06,0x2c,0x04,0x16,0x05,0x06]
+// NOGFX9: error: instruction not supported on this GPU
+v_mac_f32 v3, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 
 // NOSICI: error:
-// NOVI: error: invalid operand for instruction
-v_and_b32 v0, s42, v1 src0_sel:BYTE_2 src1_sel:WORD_0
+// VI: v_mac_f32_sdwa v15, v99, v194 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:WORD_0 src1_sel:DWORD ; encoding: [0xf9,0x84,0x1f,0x2c,0x63,0x0e,0x04,0x06]
+// NOGFX9: error: instruction not supported on this GPU
+v_mac_f32 v15, v99, v194 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:WORD_0
 
 // NOSICI: error:
 // NOVI: error: invalid operand for instruction
-v_add_f32 v0, v1, s45 src0_sel:BYTE_2 src1_sel:WORD_0
+// NOGFX9: error: instruction not supported on this GPU
+v_mac_f32 v194, v13, v1 dst_sel:BYTE_0 dst_unused:UNUSED_SEXT src0_sel:BYTE_3 src1_sel:BYTE_2
 
 // NOSICI: error:
-// NOVI: error: invalid operand for instruction
-v_cmpx_class_f32 vcc, s1, s2 src0_sel:BYTE_2 src1_sel:WORD_0
+// VI: v_mac_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x46,0x02,0x06,0x05,0x02]
+// NOGFX9: error: instruction not supported on this GPU
+v_mac_f16 v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
+
+//===----------------------------------------------------------------------===//
+// Scalar registers are allowed
+//===----------------------------------------------------------------------===//
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_mov_b32_sdwa v1, s2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x02,0x10,0x86,0x06]
+v_mov_b32 v1, s2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_mov_b32_sdwa v1, exec dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x7e,0x10,0x86,0x06]
+v_mov_b32 v1, exec dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_add_f32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x06,0x85,0x02]
+v_add_f32 v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_add_f32_sdwa v0, v0, s22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x2c,0x00,0x02,0x00,0x06,0x05,0x82]
+v_add_f32 v0, v0, s22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
+
+// NOSICI: error:
+// NOVI: error:
+// NO: invalid operand (violates constant bus restrictions)
+v_add_f32 v0, exec, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_cmp_eq_f32_sdwa vcc, s1, v2 src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x04,0x84,0x7c,0x01,0x00,0x85,0x02]
+v_cmp_eq_f32_sdwa vcc, s1, v2 src0_sel:WORD_1 src1_sel:BYTE_2
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_cmp_eq_f32_sdwa vcc, v1, s22 src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x2c,0x84,0x7c,0x01,0x00,0x05,0x82]
+v_cmp_eq_f32_sdwa vcc, v1, s22 src0_sel:WORD_1 src1_sel:BYTE_2
+
+// NOSICI: error:
+// NOVI: error:
+// NOGFX9: error: invalid operand (violates constant bus restrictions)
+v_cmp_eq_f32_sdwa vcc, exec, vcc src0_sel:WORD_1 src1_sel:BYTE_2
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_ceil_f16_sdwa v5, flat_scratch_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD ; encoding: [0xf9,0x8a,0x0a,0x7e,0x66,0x06,0x86,0x06]
+v_ceil_f16_sdwa v5, flat_scratch_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+
+//===----------------------------------------------------------------------===//
+// VOPC with arbitrary SGPR destination
+//===----------------------------------------------------------------------===//
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_cmp_eq_f32_sdwa s[2:3], v1, v2 src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x04,0x84,0x7c,0x01,0x82,0x05,0x02]
+v_cmp_eq_f32_sdwa s[2:3], v1, v2 src0_sel:WORD_1 src1_sel:BYTE_2
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_cmp_eq_f32_sdwa exec, v1, v2 src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x04,0x84,0x7c,0x01,0xfe,0x05,0x02]
+v_cmp_eq_f32_sdwa exec, v1, v2 src0_sel:WORD_1 src1_sel:BYTE_2
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_cmp_eq_f32_sdwa exec, s2, v2 src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x04,0x84,0x7c,0x02,0xfe,0x85,0x02]
+v_cmp_eq_f32_sdwa exec, s2, v2 src0_sel:WORD_1 src1_sel:BYTE_2
+
+//===----------------------------------------------------------------------===//
+// OMod output modifier allowed
+//===----------------------------------------------------------------------===//
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_trunc_f32_sdwa v1, v2 mul:2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x38,0x02,0x7e,0x02,0x50,0x06,0x06]
+v_trunc_f32 v1, v2 mul:2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_trunc_f32_sdwa v1, v2 clamp div:2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x38,0x02,0x7e,0x02,0xf0,0x06,0x06]
+v_trunc_f32 v1, v2 clamp div:2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_add_f32_sdwa v0, v0, v0 mul:2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x46,0x05,0x02]
+v_add_f32 v0, v0, v0 mul:2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
+
+// NOSICI: error:
+// NOVI: error:
+// GFX9: v_add_f32_sdwa v0, v0, v0 clamp div:2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x02,0x00,0xe6,0x05,0x02]
+v_add_f32 v0, v0, v0 clamp div:2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2
+\ No newline at end of file
diff --git a/test/MC/Disassembler/AMDGPU/sdwa_gfx9.txt b/test/MC/Disassembler/AMDGPU/sdwa_gfx9.txt
new file mode 100644
index 000000000000..c697ebce2650
--- /dev/null
+++ b/test/MC/Disassembler/AMDGPU/sdwa_gfx9.txt
@@ -0,0 +1,477 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -disassemble -show-encoding < %s | FileCheck %s -check-prefix=GFX9
+
+#-----------------------------------------------------------------------------#
+# Input modifiers
+#-----------------------------------------------------------------------------#
+
+# GFX9: v_fract_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x25,0x06]
+0xf9 0x36 0x00 0x7e 0x00 0x06 0x25 0x06
+
+# GFX9: v_sin_f32_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x35,0x06]
+0xf9 0x52 0x00 0x7e 0x00 0x06 0x35 0x06
+
+# GFX9: v_add_f32_sdwa v0, -|v0|, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x06,0x35,0x12]
+0xf9 0x00 0x00 0x02 0x00 0x06 0x35 0x12
+
+# GFX9: v_min_f32_sdwa v0, |v0|, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x14,0x00,0x06,0x25,0x12]
+0xf9 0x00 0x00 0x14 0x00 0x06 0x25 0x12
+
+#-----------------------------------------------------------------------------#
+# VOP1
+#-----------------------------------------------------------------------------#
+
+# GFX9: v_mov_b32_sdwa v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x02,0x10,0x06,0x06]
+0xf9 0x02 0x02 0x7e 0x02 0x10 0x06 0x06
+
+# GFX9: v_mov_b32_sdwa v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; encoding: [0xf9,0x02,0x06,0x7e,0x04,0x11,0x05,0x06]
+0xf9 0x02 0x06 0x7e 0x04 0x11 0x05 0x06
+
+# GFX9: v_mov_b32_sdwa v15, v99 dst_sel:BYTE_2 dst_unused:UNUSED_SEXT src0_sel:WORD_0 ; encoding: [0xf9,0x02,0x1e,0x7e,0x63,0x0a,0x04,0x06]
+0xf9 0x02 0x1e 0x7e 0x63 0x0a 0x04 0x06
+
+# GFX9: v_min_u32_sdwa v194, v13, v1 dst_sel:BYTE_3 dst_unused:UNUSED_SEXT src0_sel:BYTE_3 src1_sel:BYTE_2 ; encoding: [0xf9,0x02,0x84,0x1d,0x0d,0x0b,0x03,0x02]
+0xf9 0x02 0x84 0x1d 0x0d 0x0b 0x03 0x02
+
+# GFX9: v_min_u32_sdwa v255, v4, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:WORD_1 ; encoding: [0xf9,0x02,0xfe,0x1d,0x04,0x04,0x02,0x05]
+0xf9 0x02 0xfe 0x1d 0x04 0x04 0x02 0x05
+
+# GFX9: v_min_u32_sdwa v200, v200, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x90,0x1d,0xc8,0x05,0x01,0x06]
+0xf9 0x02 0x90 0x1d 0xc8 0x05 0x01 0x06
+
+# GFX9: v_min_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x1c,0x01,0x06,0x00,0x06]
+0xf9 0x02 0x02 0x1c 0x01 0x06 0x00 0x06
+
+# GFX9: v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x0e,0x00,0x7e,0x00,0x16,0x06,0x06]
+0xf9 0x0e 0x00 0x7e 0x00 0x16 0x06 0x06
+
+# GFX9: v_fract_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD ; encoding: [0xf9,0x36,0x00,0x7e,0x00,0x26,0x06,0x06]
+0xf9 0x36 0x00 0x7e 0x00 0x26 0x06 0x06
+
+# GFX9: v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x52 0x00 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_mov_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; encoding: [0xf9,0x02,0x02,0x7e,0x00,0x16,0x05,0x06]
+0xf9 0x02 0x02 0x7e 0x00 0x16 0x05 0x06
+
+# GFX9: v_trunc_f32_sdwa v1, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 ; encoding: [0xf9,0x38,0x02,0x7e,0x00,0x36,0x05,0x06]
+0xf9 0x38 0x02 0x7e 0x00 0x36 0x05 0x06
+
+# GFX9: v_mov_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x00,0x16,0x06,0x06]
+0xf9 0x02 0x02 0x7e 0x00 0x16 0x06 0x06
+
+# GFX9: v_nop ; encoding: [0xf9,0x00,0x00,0x7e,0x00,0x16,0x06,0x06]
+0xf9 0x00 0x00 0x7e 0x00 0x16 0x06 0x06
+
+# GFX9: v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x0e 0x00 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_fract_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x36 0x00 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x52 0x00 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_mov_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x02,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x02 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_f32_i32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x0a,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x0a 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x0c,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x0c 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_i32_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x10,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x10 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_f16_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x14,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x14 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_f32_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x16,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x16 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_rpi_i32_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x18,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x18 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_flr_i32_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x1a,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x1a 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_off_f32_i4_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x1c,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x1c 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x22,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x22 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_f32_ubyte1_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x24,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x24 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_f32_ubyte2_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x26,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x26 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_f32_ubyte3_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x28,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x28 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_trunc_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x38,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x38 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_ceil_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x3a,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x3a 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_rndne_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x3c,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x3c 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_floor_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x3e,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x3e 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_exp_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x40,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x40 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_log_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x42,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x42 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_rcp_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x44,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x44 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_rcp_iflag_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x46,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x46 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_rsq_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x48,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x48 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_sqrt_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x4e,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x4e 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cos_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x54,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x54 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_not_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x56,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x56 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_bfrev_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x58,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x58 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x5a,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x5a 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_ffbl_b32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x5c,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x5c 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_ffbh_i32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x5e,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x5e 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_frexp_exp_i32_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x66,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x66 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_frexp_mant_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x68,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x68 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_log_legacy_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x98,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x98 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_exp_legacy_f32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x96,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x96 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_f16_u16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x72,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x72 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_f16_i16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x74,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x74 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_u16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x76,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x76 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cvt_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x78,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x78 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x7a,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x7a 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_sqrt_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x7c,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x7c 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x7e,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x7e 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x80,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x80 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x82,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x82 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x84,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x84 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x86,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x86 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_floor_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x88,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x88 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_ceil_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x8a,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x8a 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_trunc_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x8c,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x8c 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x8e,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x8e 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_fract_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x90,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x90 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_sin_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x92,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x92 0x02 0x7e 0x00 0x06 0x05 0x06
+
+# GFX9: v_cos_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x94,0x02,0x7e,0x00,0x06,0x05,0x06]
+0xf9 0x94 0x02 0x7e 0x00 0x06 0x05 0x06
+
+#-----------------------------------------------------------------------------#
+# VOP2
+#-----------------------------------------------------------------------------#
+
+# GFX9: v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x06,0x05,0x06]
+0xf9 0x00 0x00 0x02 0x00 0x06 0x05 0x06
+
+# GFX9: v_min_f32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x14,0x00,0x36,0x06,0x02]
+0xf9 0x00 0x00 0x14 0x00 0x36 0x06 0x02
+
+# GFX9: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x02]
+0xf9 0x00 0x00 0x26 0x00 0x06 0x06 0x02
+
+# GFX9: v_mul_i32_i24_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x06,0x02,0x0c,0x02,0x16,0x06,0x06]
+0xf9 0x06 0x02 0x0c 0x02 0x16 0x06 0x06
+
+# GFX9: v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x06,0x05,0x02]
+0xf9 0x00 0x00 0x02 0x00 0x06 0x05 0x02
+
+# GFX9: v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x14,0x00,0x06,0x05,0x02]
+0xf9 0x00 0x00 0x14 0x00 0x06 0x05 0x02
+
+# GFX9: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x26,0x00,0x06,0x05,0x02]
+0xf9 0x00 0x00 0x26 0x00 0x06 0x05 0x02
+
+# GFX9: v_mul_i32_i24_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x0c,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x0c 0x02 0x06 0x05 0x02
+
+# GFX9: v_sub_f32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x04,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x04 0x02 0x06 0x05 0x02
+
+# GFX9: v_subrev_f32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x06,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x06 0x02 0x06 0x05 0x02
+
+# GFX9: v_mul_f32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x0a,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x0a 0x02 0x06 0x05 0x02
+
+# GFX9: v_mul_hi_i32_i24_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x0e,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x0e 0x02 0x06 0x05 0x02
+
+# GFX9: v_mul_u32_u24_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x10,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x10 0x02 0x06 0x05 0x02
+
+# GFX9: v_mul_hi_u32_u24_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x12,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x12 0x02 0x06 0x05 0x02
+
+# GFX9: v_max_f32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x16,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x16 0x02 0x06 0x05 0x02
+
+# GFX9: v_min_i32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x18,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x18 0x02 0x06 0x05 0x02
+
+# GFX9: v_max_i32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x1a,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x1a 0x02 0x06 0x05 0x02
+
+# GFX9: v_min_u32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x1c,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x1c 0x02 0x06 0x05 0x02
+
+# GFX9: v_max_u32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x1e,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x1e 0x02 0x06 0x05 0x02
+
+# GFX9: v_lshrrev_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x20,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x20 0x02 0x06 0x05 0x02
+
+# GFX9: v_ashrrev_i32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x22,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x22 0x02 0x06 0x05 0x02
+
+# GFX9: v_lshlrev_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x24,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x24 0x02 0x06 0x05 0x02
+
+# GFX9: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x28,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x28 0x02 0x06 0x05 0x02
+
+# GFX9: v_xor_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x2a,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x2a 0x02 0x06 0x05 0x02
+
+# GFX9: v_add_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x3e,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x3e 0x02 0x06 0x05 0x02
+
+# GFX9: v_sub_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x40,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x40 0x02 0x06 0x05 0x02
+
+# GFX9: v_subrev_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x42,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x42 0x02 0x06 0x05 0x02
+
+# GFX9: v_mul_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x44,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x44 0x02 0x06 0x05 0x02
+
+# GFX9: v_add_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x4c,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x4c 0x02 0x06 0x05 0x02
+
+# GFX9: v_sub_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x4e,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x4e 0x02 0x06 0x05 0x02
+
+# GFX9: v_subrev_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x50,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x50 0x02 0x06 0x05 0x02
+
+# GFX9: v_mul_lo_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x52,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x52 0x02 0x06 0x05 0x02
+
+# GFX9: v_lshlrev_b16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x54,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x54 0x02 0x06 0x05 0x02
+
+# GFX9: v_lshrrev_b16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x56,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x56 0x02 0x06 0x05 0x02
+
+# GFX9: v_ashrrev_i16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x58,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x58 0x02 0x06 0x05 0x02
+
+# GFX9: v_max_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x5a,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x5a 0x02 0x06 0x05 0x02
+
+# GFX9: v_min_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x5c,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x5c 0x02 0x06 0x05 0x02
+
+# GFX9: v_max_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x5e,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x5e 0x02 0x06 0x05 0x02
+
+# GFX9: v_max_i16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x60,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x60 0x02 0x06 0x05 0x02
+
+# GFX9: v_min_u16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x62,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x62 0x02 0x06 0x05 0x02
+
+# GFX9: v_min_i16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x64,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x64 0x02 0x06 0x05 0x02
+
+# GFX9: v_ldexp_f16_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x06,0x02,0x66,0x02,0x06,0x05,0x02]
+0xf9 0x06 0x02 0x66 0x02 0x06 0x05 0x02
+
+#-----------------------------------------------------------------------------#
+# VOPC
+#-----------------------------------------------------------------------------#
+
+# GFX9: v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x84,0x7c,0x01,0x00,0x02,0x04]
+0xf9 0x04 0x84 0x7c 0x01 0x00 0x02 0x04
+
+# GFX9: v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x98,0x7c,0x01,0x00,0x02,0x04]
+0xf9 0x04 0x98 0x7c 0x01 0x00 0x02 0x04
+
+# GFX9: v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xa8,0x7c,0x01,0x00,0x02,0x04]
+0xf9 0x04 0xa8 0x7c 0x01 0x00 0x02 0x04
+
+# GFX9: v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xbc,0x7c,0x01,0x00,0x02,0x04]
+0xf9 0x04 0xbc 0x7c 0x01 0x00 0x02 0x04
+
+# GFX9: v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x82,0x7d,0x01,0x00,0x02,0x04]
+0xf9 0x04 0x82 0x7d 0x01 0x00 0x02 0x04
+
+# GFX9: v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x8e,0x7d,0x01,0x00,0x02,0x04]
+0xf9 0x04 0x8e 0x7d 0x01 0x00 0x02 0x04
+
+# GFX9: v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xa4,0x7d,0x01,0x00,0x02,0x04]
+0xf9 0x04 0xa4 0x7d 0x01 0x00 0x02 0x04
+
+# GFX9: v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xaa,0x7d,0x01,0x00,0x02,0x04]
+0xf9 0x04 0xaa 0x7d 0x01 0x00 0x02 0x04
+
+# GFX9: v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x90,0x7d,0x01,0x00,0x02,0x04]
+0xf9 0x04 0x90 0x7d 0x01 0x00 0x02 0x04
+
+# GFX9: v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x98,0x7d,0x01,0x00,0x02,0x04]
+0xf9 0x04 0x98 0x7d 0x01 0x00 0x02 0x04
+
+# GFX9: v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xb6,0x7d,0x01,0x00,0x02,0x04]
+0xf9 0x04 0xb6 0x7d 0x01 0x00 0x02 0x04
+
+# GFX9: v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0xba,0x7d,0x01,0x00,0x02,0x04]
+0xf9 0x04 0xba 0x7d 0x01 0x00 0x02 0x04
+
+# GFX9: v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x20,0x7c,0x01,0x00,0x02,0x04]
+0xf9 0x04 0x20 0x7c 0x01 0x00 0x02 0x04
+
+# GFX9: v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x22,0x7c,0x01,0x00,0x02,0x04]
+0xf9 0x04 0x22 0x7c 0x01 0x00 0x02 0x04
+
+#-----------------------------------------------------------------------------#
+# Modifiers
+#-----------------------------------------------------------------------------#
+
+# GFX9: v_fract_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x25,0x06]
+0xf9 0x36 0x00 0x7e 0x00 0x06 0x25 0x06
+
+# GFX9: v_sin_f32_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; encoding: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x35,0x06]
+0xf9 0x52 0x00 0x7e 0x00 0x06 0x35 0x06
+
+# GFX9: v_add_f32_sdwa v0, -|v0|, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x06,0x35,0x12]
+0xf9 0x00 0x00 0x02 0x00 0x06 0x35 0x12
+
+# GFX9: v_min_f32_sdwa v0, |v0|, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x14,0x00,0x06,0x25,0x12]
+0xf9 0x00 0x00 0x14 0x00 0x06 0x25 0x12
+
+# GFX9: v_mov_b32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x00,0x16,0x0e,0x06]
+0xf9 0x02 0x02 0x7e 0x00 0x16 0x0e 0x06
+
+# GFX9: v_and_b32_sdwa v0, sext(v0), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x26,0x00,0x06,0x0e,0x0a]
+0xf9 0x00 0x00 0x26 0x00 0x06 0x0e 0x0a
+
+# GFX9: v_cmp_class_f32_sdwa vcc, -v1, sext(v2) src0_sel:BYTE_2 src1_sel:WORD_0 ; encoding: [0xf9,0x04,0x20,0x7c,0x01,0x00,0x12,0x0c]
+0xf9 0x04 0x20 0x7c 0x01 0x00 0x12 0x0c
+
+#===------------------------------------------------------------------------===#
+# Scalar registers are allowed
+#===------------------------------------------------------------------------===#
+
+# GFX9: v_mov_b32_sdwa v1, s2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x02,0x10,0x86,0x06]
+0xf9 0x02 0x02 0x7e 0x02 0x10 0x86 0x06
+
+# GFX9: v_mov_b32_sdwa v1, exec_lo dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x7e,0x7e,0x10,0x86,0x06]
+0xf9 0x02 0x02 0x7e 0x7e 0x10 0x86 0x06
+
+# GFX9: v_add_f32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x06,0x85,0x02]
+0xf9 0x00 0x00 0x02 0x00 0x06 0x85 0x02
+
+# GFX9: v_add_f32_sdwa v0, v0, s22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x2c,0x00,0x02,0x00,0x06,0x05,0x82]
+0xf9 0x2c 0x00 0x02 0x00 0x06 0x05 0x82
+
+# GFX9: v_cmp_eq_f32_sdwa vcc, s1, v2 src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x04,0x84,0x7c,0x01,0x00,0x85,0x02]
+0xf9 0x04 0x84 0x7c 0x01 0x00 0x85 0x02
+
+# GFX9: v_cmp_eq_f32_sdwa vcc, v1, s22 src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x2c,0x84,0x7c,0x01,0x00,0x05,0x82]
+0xf9 0x2c 0x84 0x7c 0x01 0x00 0x05 0x82
+
+#===------------------------------------------------------------------------===#
+# VOPC with arbitrary SGPR destination
+#===------------------------------------------------------------------------===#
+
+# GFX9: v_cmp_eq_f32_sdwa s[2:3], v1, v2 src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x04,0x84,0x7c,0x01,0x82,0x05,0x02]
+0xf9 0x04 0x84 0x7c 0x01 0x82 0x05 0x02
+
+# GFX9: v_cmp_eq_f32_sdwa exec, v1, v2 src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x04,0x84,0x7c,0x01,0xfe,0x05,0x02]
+0xf9 0x04 0x84 0x7c 0x01 0xfe 0x05 0x02
+
+# GFX9: v_cmp_eq_f32_sdwa exec, s2, v2 src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x04,0x84,0x7c,0x02,0xfe,0x85,0x02]
+0xf9 0x04 0x84 0x7c 0x02 0xfe 0x85 0x02
+
+#===------------------------------------------------------------------------===#
+# OMod output modifier allowed
+#===------------------------------------------------------------------------===#
+
+# GFX9: v_trunc_f32_sdwa v1, v2 mul:2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x38,0x02,0x7e,0x02,0x50,0x06,0x06]
+0xf9 0x38 0x02 0x7e 0x02 0x50 0x06 0x06
+
+# GFX9: v_trunc_f32_sdwa v1, v2 clamp div:2 dst_sel:BYTE_0 dst_unused:UNUSED_PRESERVE src0_sel:DWORD ; encoding: [0xf9,0x38,0x02,0x7e,0x02,0xf0,0x06,0x06]
+0xf9 0x38 0x02 0x7e 0x02 0xf0 0x06 0x06
+
+# GFX9: v_add_f32_sdwa v0, v0, v0 mul:2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x02,0x00,0x46,0x05,0x02]
+0xf9 0x00 0x00 0x02 0x00 0x46 0x05 0x02
+
+# GFX9: v_add_f32_sdwa v0, v0, v0 clamp div:2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:BYTE_2 ; encoding: [0xf9,0x00,0x00,0x02,0x00,0xe6,0x05,0x02]
+0xf9 0x00 0x00 0x02 0x00 0xe6 0x05 0x02
diff --git a/test/MC/Disassembler/X86/avx-512.txt b/test/MC/Disassembler/X86/avx-512.txt
index b0d1009476f5..7eda07f0d30c 100644
--- a/test/MC/Disassembler/X86/avx-512.txt
+++ b/test/MC/Disassembler/X86/avx-512.txt
@@ -1,5 +1,6 @@
 # RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s
 # RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 -mcpu=skx | FileCheck --check-prefix=CHECK-SKX %s
+# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=AVX512VPOPCNTDQ
 
 # CHECK: vpbroadcastd    %xmm18, %zmm28 {%k7} {z}
 0x62 0x22 0x7d 0xcf 0x58 0xe2
@@ -265,3 +266,25 @@
 
 # CHECK: vscatterqpd %ymm19, 256(%r9,%ymm31) {%k1}
 0x62 0x82 0xfd 0x21 0xa3 0x5c 0x39 0x20
+
+#####################################################
+#             POPULATION COUNT                      #
+#####################################################
+
+# AVX512VPOPCNTDQ: vpopcntd   %zmm21, %zmm26 {%k4}
+0x62 0x22 0x7d 0x4c 0x55 0xd5
+
+# AVX512VPOPCNTDQ: vpopcntd   %zmm21, %zmm26 {%k4} {z} 
+0x62 0x22 0x7d 0xcc 0x55 0xd5
+
+# AVX512VPOPCNTDQ: vpopcntd   (%rcx), %zmm26  
+0x62 0x62 0x7d 0x48 0x55 0x11
+
+# AVX512VPOPCNTDQ: vpopcntq   %zmm21, %zmm17 {%k6} 
+0x62 0xa2 0xfd 0x4e 0x55 0xcd
+
+# AVX512VPOPCNTDQ: vpopcntq   %zmm21, %zmm17 {%k6} {z} 
+0x62 0xa2 0xfd 0xce 0x55 0xcd
+
+# AVX512VPOPCNTDQ: vpopcntq   (%rcx), %zmm17  
+0x62 0xe2 0xfd 0x48 0x55 0x09
diff --git a/test/MC/WebAssembly/unnamed-data.ll b/test/MC/WebAssembly/unnamed-data.ll
new file mode 100644
index 000000000000..77a7c08f6594
--- /dev/null
+++ b/test/MC/WebAssembly/unnamed-data.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple wasm32-unknown-unknown-wasm -filetype=obj %s -o - | obj2yaml | FileCheck %s
+
+@.str1 = private unnamed_addr constant [6 x i8] c"hello\00", align 1
+@.str2 = private unnamed_addr constant [6 x i8] c"world\00", align 1
+
+@a = global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str1, i32 0, i32 0), align 8
+@b = global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str2, i32 0, i32 0), align 8
+
+
+; CHECK:   - Type:            GLOBAL
+; CHECK:     Globals:         
+; CHECK:       - Type:            I32
+; CHECK:         Mutable:         false
+; CHECK:         InitExpr:        
+; CHECK:           Opcode:          I32_CONST
+; CHECK:           Value:           0
+; CHECK:       - Type:            I32
+; CHECK:         Mutable:         false
+; CHECK:         InitExpr:        
+; CHECK:           Opcode:          I32_CONST
+; CHECK:           Value:           6
+; CHECK:       - Type:            I32
+; CHECK:         Mutable:         false
+; CHECK:         InitExpr:        
+; CHECK:           Opcode:          I32_CONST
+; CHECK:           Value:           16
+; CHECK:       - Type:            I32
+; CHECK:         Mutable:         false
+; CHECK:         InitExpr:        
+; CHECK:           Opcode:          I32_CONST
+; CHECK:           Value:           24
+; CHECK:   - Type:            EXPORT
+; CHECK:     Exports:         
+; CHECK:       - Name:            a
+; CHECK:         Kind:            GLOBAL
+; CHECK:         Index:           2
+; CHECK:       - Name:            b
+; CHECK:         Kind:            GLOBAL
+; CHECK:         Index:           3
+; CHECK:   - Type:            DATA
+; CHECK:     Relocations:     
+; CHECK:       - Type:            R_WEBASSEMBLY_GLOBAL_ADDR_I32
+; CHECK:         Index:           0
+; CHECK:         Offset:          0x00000016
+; CHECK:       - Type:            R_WEBASSEMBLY_GLOBAL_ADDR_I32
+; CHECK:         Index:           1
+; CHECK:         Offset:          0x0000001E
+; CHECK:     Segments:        
+; CHECK:       - Index:           0
+; CHECK:         Offset:          
+; CHECK:           Opcode:          I32_CONST
+; CHECK:           Value:           0
+; CHECK:         Content:         68656C6C6F00776F726C640000000000000000000000000006000000
diff --git a/test/MC/X86/pr22004.s b/test/MC/X86/pr22004.s
new file mode 100644
index 000000000000..3ef1526e43ae
--- /dev/null
+++ b/test/MC/X86/pr22004.s
@@ -0,0 +1,3 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel %s
+
+lea rax, qword ptr [rip + .L.str]
diff --git a/test/MC/X86/x86-64-avx512vpopcntdq.s b/test/MC/X86/x86-64-avx512vpopcntdq.s
new file mode 100644
index 000000000000..925d2c6308e4
--- /dev/null
+++ b/test/MC/X86/x86-64-avx512vpopcntdq.s
@@ -0,0 +1,225 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avx512vpopcntdq --show-encoding %s | FileCheck %s
+
+// CHECK: vpopcntq   %zmm25, %zmm20  
+// CHECK: encoding: [0x62,0x82,0xfd,0x48,0x55,0xe1]
+          vpopcntq   %zmm25, %zmm20  
+
+// CHECK: vpopcntq   %zmm25, %zmm20 {%k6} 
+// CHECK: encoding: [0x62,0x82,0xfd,0x4e,0x55,0xe1]
+          vpopcntq   %zmm25, %zmm20 {%k6} 
+
+// CHECK: vpopcntq   %zmm25, %zmm20 {%k6} {z} 
+// CHECK: encoding: [0x62,0x82,0xfd,0xce,0x55,0xe1]
+          vpopcntq   %zmm25, %zmm20 {%k6} {z} 
+
+// CHECK: vpopcntq   (%rcx), %zmm20  
+// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x21]
+          vpopcntq   (%rcx), %zmm20  
+
+// CHECK: vpopcntq   291(%rax,%r14,8), %zmm20 
+// CHECK: encoding: [0x62,0xa2,0xfd,0x48,0x55,0xa4,0xf0,0x23,0x01,0x00,0x00]
+          vpopcntq   291(%rax,%r14,8), %zmm20 
+
+// CHECK: vpopcntq   (%rcx){1to8}, %zmm20 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x21]
+          vpopcntq   (%rcx){1to8}, %zmm20 
+
+// CHECK: vpopcntq   4064(%rdx), %zmm20 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0xa2,0xe0,0x0f,0x00,0x00]
+          vpopcntq   4064(%rdx), %zmm20 
+
+// CHECK: vpopcntq   4096(%rdx), %zmm20 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x62,0x40]
+          vpopcntq   4096(%rdx), %zmm20 
+
+// CHECK: vpopcntq   -4096(%rdx), %zmm20 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x62,0xc0]
+          vpopcntq   -4096(%rdx), %zmm20 
+
+// CHECK: vpopcntq   -4128(%rdx), %zmm20 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0xa2,0xe0,0xef,0xff,0xff]
+          vpopcntq   -4128(%rdx), %zmm20 
+
+// CHECK: vpopcntq   1016(%rdx){1to8}, %zmm20 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x62,0x7f]
+          vpopcntq   1016(%rdx){1to8}, %zmm20 
+
+// CHECK: vpopcntq   1024(%rdx){1to8}, %zmm20 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0xa2,0x00,0x04,0x00,0x00]
+          vpopcntq   1024(%rdx){1to8}, %zmm20 
+
+// CHECK: vpopcntq   -1024(%rdx){1to8}, %zmm20 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x62,0x80]
+          vpopcntq   -1024(%rdx){1to8}, %zmm20 
+
+// CHECK: vpopcntq   -1032(%rdx){1to8}, %zmm20 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0xa2,0xf8,0xfb,0xff,0xff]
+          vpopcntq   -1032(%rdx){1to8}, %zmm20 
+
+// CHECK: vpopcntq   %zmm21, %zmm17  
+// CHECK: encoding: [0x62,0xa2,0xfd,0x48,0x55,0xcd]
+          vpopcntq   %zmm21, %zmm17  
+
+// CHECK: vpopcntq   %zmm21, %zmm17 {%k6} 
+// CHECK: encoding: [0x62,0xa2,0xfd,0x4e,0x55,0xcd]
+          vpopcntq   %zmm21, %zmm17 {%k6} 
+
+// CHECK: vpopcntq   %zmm21, %zmm17 {%k6} {z} 
+// CHECK: encoding: [0x62,0xa2,0xfd,0xce,0x55,0xcd]
+          vpopcntq   %zmm21, %zmm17 {%k6} {z} 
+
+// CHECK: vpopcntq   (%rcx), %zmm17  
+// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x09]
+          vpopcntq   (%rcx), %zmm17  
+
+// CHECK: vpopcntq   4660(%rax,%r14,8), %zmm17 
+// CHECK: encoding: [0x62,0xa2,0xfd,0x48,0x55,0x8c,0xf0,0x34,0x12,0x00,0x00]
+          vpopcntq   4660(%rax,%r14,8), %zmm17 
+
+// CHECK: vpopcntq   (%rcx){1to8}, %zmm17 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x09]
+          vpopcntq   (%rcx){1to8}, %zmm17 
+
+// CHECK: vpopcntq   4064(%rdx), %zmm17 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x8a,0xe0,0x0f,0x00,0x00]
+          vpopcntq   4064(%rdx), %zmm17 
+
+// CHECK: vpopcntq   4096(%rdx), %zmm17 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x4a,0x40]
+          vpopcntq   4096(%rdx), %zmm17 
+
+// CHECK: vpopcntq   -4096(%rdx), %zmm17 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x4a,0xc0]
+          vpopcntq   -4096(%rdx), %zmm17 
+
+// CHECK: vpopcntq   -4128(%rdx), %zmm17 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x48,0x55,0x8a,0xe0,0xef,0xff,0xff]
+          vpopcntq   -4128(%rdx), %zmm17 
+
+// CHECK: vpopcntq   1016(%rdx){1to8}, %zmm17 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x4a,0x7f]
+          vpopcntq   1016(%rdx){1to8}, %zmm17 
+
+// CHECK: vpopcntq   1024(%rdx){1to8}, %zmm17 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x8a,0x00,0x04,0x00,0x00]
+          vpopcntq   1024(%rdx){1to8}, %zmm17 
+
+// CHECK: vpopcntq   -1024(%rdx){1to8}, %zmm17 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x4a,0x80]
+          vpopcntq   -1024(%rdx){1to8}, %zmm17 
+
+// CHECK: vpopcntq   -1032(%rdx){1to8}, %zmm17 
+// CHECK: encoding: [0x62,0xe2,0xfd,0x58,0x55,0x8a,0xf8,0xfb,0xff,0xff]
+          vpopcntq   -1032(%rdx){1to8}, %zmm17 
+
+// CHECK: vpopcntd   %zmm19, %zmm25  
+// CHECK: encoding: [0x62,0x22,0x7d,0x48,0x55,0xcb]
+          vpopcntd   %zmm19, %zmm25  
+
+// CHECK: vpopcntd   %zmm19, %zmm25 {%k4} 
+// CHECK: encoding: [0x62,0x22,0x7d,0x4c,0x55,0xcb]
+          vpopcntd   %zmm19, %zmm25 {%k4} 
+
+// CHECK: vpopcntd   %zmm19, %zmm25 {%k4} {z} 
+// CHECK: encoding: [0x62,0x22,0x7d,0xcc,0x55,0xcb]
+          vpopcntd   %zmm19, %zmm25 {%k4} {z} 
+
+// CHECK: vpopcntd   (%rcx), %zmm25  
+// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x09]
+          vpopcntd   (%rcx), %zmm25  
+
+// CHECK: vpopcntd   291(%rax,%r14,8), %zmm25 
+// CHECK: encoding: [0x62,0x22,0x7d,0x48,0x55,0x8c,0xf0,0x23,0x01,0x00,0x00]
+          vpopcntd   291(%rax,%r14,8), %zmm25 
+
+// CHECK: vpopcntd   (%rcx){1to16}, %zmm25 
+// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x09]
+          vpopcntd   (%rcx){1to16}, %zmm25
+
+// CHECK: vpopcntd   4064(%rdx), %zmm25 
+// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x8a,0xe0,0x0f,0x00,0x00]
+          vpopcntd   4064(%rdx), %zmm25 
+
+// CHECK: vpopcntd   4096(%rdx), %zmm25 
+// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x4a,0x40]
+          vpopcntd   4096(%rdx), %zmm25 
+
+// CHECK: vpopcntd   -4096(%rdx), %zmm25 
+// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x4a,0xc0]
+          vpopcntd   -4096(%rdx), %zmm25 
+
+// CHECK: vpopcntd   -4128(%rdx), %zmm25 
+// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x8a,0xe0,0xef,0xff,0xff]
+          vpopcntd   -4128(%rdx), %zmm25 
+
+// CHECK: vpopcntd   508(%rdx){1to16}, %zmm25 
+// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x4a,0x7f]
+          vpopcntd   508(%rdx){1to16}, %zmm25 
+
+// CHECK: vpopcntd   512(%rdx){1to16}, %zmm25 
+// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x8a,0x00,0x02,0x00,0x00]
+          vpopcntd   512(%rdx){1to16}, %zmm25 
+
+// CHECK: vpopcntd   -512(%rdx){1to16}, %zmm25 
+// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x4a,0x80]
+          vpopcntd   -512(%rdx){1to16}, %zmm25 
+
+// CHECK: vpopcntd   -516(%rdx){1to16}, %zmm25 
+// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x8a,0xfc,0xfd,0xff,0xff]
+          vpopcntd   -516(%rdx){1to16}, %zmm25 
+
+// CHECK: vpopcntd   %zmm21, %zmm26  
+// CHECK: encoding: [0x62,0x22,0x7d,0x48,0x55,0xd5]
+          vpopcntd   %zmm21, %zmm26  
+
+// CHECK: vpopcntd   %zmm21, %zmm26 {%k4} 
+// CHECK: encoding: [0x62,0x22,0x7d,0x4c,0x55,0xd5]
+          vpopcntd   %zmm21, %zmm26 {%k4} 
+
+// CHECK: vpopcntd   %zmm21, %zmm26 {%k4} {z} 
+// CHECK: encoding: [0x62,0x22,0x7d,0xcc,0x55,0xd5]
+          vpopcntd   %zmm21, %zmm26 {%k4} {z} 
+
+// CHECK: vpopcntd   (%rcx), %zmm26  
+// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x11]
+          vpopcntd   (%rcx), %zmm26  
+
+// CHECK: vpopcntd   4660(%rax,%r14,8), %zmm26 
+// CHECK: encoding: [0x62,0x22,0x7d,0x48,0x55,0x94,0xf0,0x34,0x12,0x00,0x00]
+          vpopcntd   4660(%rax,%r14,8), %zmm26 
+
+// CHECK: vpopcntd   (%rcx){1to16}, %zmm26 
+// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x11]
+          vpopcntd   (%rcx){1to16}, %zmm26 
+
+// CHECK: vpopcntd   4064(%rdx), %zmm26 
+// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x92,0xe0,0x0f,0x00,0x00]
+          vpopcntd   4064(%rdx), %zmm26 
+
+// CHECK: vpopcntd   4096(%rdx), %zmm26 
+// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x52,0x40]
+          vpopcntd   4096(%rdx), %zmm26 
+
+// CHECK: vpopcntd   -4096(%rdx), %zmm26 
+// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x52,0xc0]
+          vpopcntd   -4096(%rdx), %zmm26 
+
+// CHECK: vpopcntd   -4128(%rdx), %zmm26 
+// CHECK: encoding: [0x62,0x62,0x7d,0x48,0x55,0x92,0xe0,0xef,0xff,0xff]
+          vpopcntd   -4128(%rdx), %zmm26 
+
+// CHECK: vpopcntd   508(%rdx){1to16}, %zmm26 
+// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x52,0x7f]
+          vpopcntd   508(%rdx){1to16}, %zmm26 
+
+// CHECK: vpopcntd   512(%rdx){1to16}, %zmm26 
+// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x92,0x00,0x02,0x00,0x00]
+          vpopcntd   512(%rdx){1to16}, %zmm26 
+
+// CHECK: vpopcntd   -512(%rdx){1to16}, %zmm26 
+// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x52,0x80]
+          vpopcntd   -512(%rdx){1to16}, %zmm26 
+
+// CHECK: vpopcntd   -516(%rdx){1to16}, %zmm26 
+// CHECK: encoding: [0x62,0x62,0x7d,0x58,0x55,0x92,0xfc,0xfd,0xff,0xff]
+          vpopcntd   -516(%rdx){1to16}, %zmm26 
diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll
index f712dc7b63ca..0ec356392a2d 100644
--- a/test/Other/new-pm-defaults.ll
+++ b/test/Other/new-pm-defaults.ll
@@ -95,6 +95,7 @@
 ; CHECK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-O-NEXT: Running pass: LICM
 ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
+; CHECK-O-NEXT: Running pass: SimpleLoopUnswitchPass
 ; CHECK-O-NEXT: Finished Loop pass manager run.
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
diff --git a/test/TableGen/GlobalISelEmitter.td b/test/TableGen/GlobalISelEmitter.td
index aeac85962f63..8778ad71ea72 100644
--- a/test/TableGen/GlobalISelEmitter.td
+++ b/test/TableGen/GlobalISelEmitter.td
@@ -462,6 +462,32 @@ def XORManyDefaults : I<(outs GPR32:$dst), (ins m1Z:$src3, Z:$src2, GPR32:$src1)
 def ORN : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2), []>;
 def : Pat<(not GPR32:$Wm), (ORN R0, GPR32:$Wm)>;
 
+//===- Test a simple pattern with just a leaf immediate. ------------------===//
+
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT:    MachineInstr &MI0 = I;
+// CHECK-NEXT:    if (MI0.getNumOperands() < 2)
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    if ((MI0.getOpcode() == TargetOpcode::G_CONSTANT) &&
+// CHECK-NEXT:        ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* Operand 1 */ (MI0.getOperand(1).isCImm() && MI0.getOperand(1).getCImm()->equalsInt(1))))) {
+// CHECK-NEXT:      // 1:i32 => (MOV1:i32)
+// CHECK-NEXT:      MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MOV1));
+// CHECK-NEXT:      MIB.add(MI0.getOperand(0)/*dst*/);
+// CHECK-NEXT:      for (const auto *FromMI : {&MI0, })
+// CHECK-NEXT:        for (const auto &MMO : FromMI->memoperands())
+// CHECK-NEXT:          MIB.addMemOperand(MMO);
+// CHECK-NEXT:      I.eraseFromParent();
+// CHECK-NEXT:      MachineInstr &NewI = *MIB;
+// CHECK-NEXT:      constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT:      return true;
+// CHECK-NEXT:    }
+// CHECK-NEXT:    return false;
+// CHECK-NEXT:  }()) { return true; }
+
+def MOV1 : I<(outs GPR32:$dst), (ins), [(set GPR32:$dst, 1)]>;
+
 //===- Test a pattern with an MBB operand. --------------------------------===//
 
 // CHECK-LABEL: if ([&]() {
diff --git a/test/Transforms/Coroutines/coro-debug.ll b/test/Transforms/Coroutines/coro-debug.ll
new file mode 100644
index 000000000000..4da545499f94
--- /dev/null
+++ b/test/Transforms/Coroutines/coro-debug.ll
@@ -0,0 +1,142 @@
+; Tests that debug information is sane after coro-split
+; RUN: opt < %s -coro-split -S | FileCheck %s
+
+source_filename = "simple-repro.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind
+define i8* @f(i32 %x) #0 !dbg !6 {
+entry:
+  %x.addr = alloca i32, align 4
+  %coro_hdl = alloca i8*, align 8
+  store i32 %x, i32* %x.addr, align 4
+  call void @llvm.dbg.declare(metadata i32* %x.addr, metadata !12, metadata !13), !dbg !14
+  call void @llvm.dbg.declare(metadata i8** %coro_hdl, metadata !15, metadata !13), !dbg !16
+  %0 = call token @llvm.coro.id(i32 0, i8* null, i8* bitcast (i8* (i32)* @f to i8*), i8* null), !dbg !16
+  %1 = call i64 @llvm.coro.size.i64(), !dbg !16
+  %call = call i8* @malloc(i64 %1), !dbg !16
+  %2 = call i8* @llvm.coro.begin(token %0, i8* %call) #7, !dbg !16
+  store i8* %2, i8** %coro_hdl, align 8, !dbg !16
+  %3 = call i8 @llvm.coro.suspend(token none, i1 false), !dbg !17
+  %conv = sext i8 %3 to i32, !dbg !17
+  call void @coro.devirt.trigger(i8* null)
+  switch i32 %conv, label %sw.default [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb1
+  ], !dbg !17
+
+sw.bb:                                            ; preds = %entry
+  br label %sw.epilog, !dbg !18
+
+sw.bb1:                                           ; preds = %entry
+  br label %coro_Cleanup, !dbg !18
+
+sw.default:                                       ; preds = %entry
+  br label %coro_Suspend, !dbg !18
+
+sw.epilog:                                        ; preds = %sw.bb
+  %4 = load i32, i32* %x.addr, align 4, !dbg !20
+  %add = add nsw i32 %4, 1, !dbg !21
+  store i32 %add, i32* %x.addr, align 4, !dbg !22
+  br label %coro_Cleanup, !dbg !23
+
+coro_Cleanup:                                     ; preds = %sw.epilog, %sw.bb1
+  %5 = load i8*, i8** %coro_hdl, align 8, !dbg !24
+  %6 = call i8* @llvm.coro.free(token %0, i8* %5), !dbg !24
+  call void @free(i8* %6), !dbg !24
+  br label %coro_Suspend, !dbg !24
+
+coro_Suspend:                                     ; preds = %coro_Cleanup, %sw.default
+  %7 = call i1 @llvm.coro.end(i8* null, i1 false) #7, !dbg !24
+  %8 = load i8*, i8** %coro_hdl, align 8, !dbg !24
+  ret i8* %8, !dbg !24
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: argmemonly nounwind readonly
+declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*) #2
+
+declare i8* @malloc(i64) #3
+
+; Function Attrs: nounwind readnone
+declare i64 @llvm.coro.size.i64() #4
+
+; Function Attrs: nounwind
+declare i8* @llvm.coro.begin(token, i8* writeonly) #5
+
+; Function Attrs: nounwind
+declare i8 @llvm.coro.suspend(token, i1) #5
+
+declare void @free(i8*) #3
+
+; Function Attrs: argmemonly nounwind readonly
+declare i8* @llvm.coro.free(token, i8* nocapture readonly) #2
+
+; Function Attrs: nounwind
+declare i1 @llvm.coro.end(i8*, i1) #5
+
+; Function Attrs: alwaysinline
+define private void @coro.devirt.trigger(i8*) #6 {
+entry:
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind readonly
+declare i8* @llvm.coro.subfn.addr(i8* nocapture readonly, i8) #2
+
+attributes #0 = { noinline nounwind "coroutine.presplit"="1" "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone speculatable }
+attributes #2 = { argmemonly nounwind readonly }
+attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind readnone }
+attributes #5 = { nounwind }
+attributes #6 = { alwaysinline }
+attributes #7 = { noduplicate }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (http://llvm.org/git/clang.git 97b002238b11ff30d94d0516d6a0515db5725fd8) (http://llvm.org/git/llvm.git 0cb060ba567f1aa5b4b04e86665f88e4632b528a)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "<stdin>", directory: "C:\5CGitHub\5Cllvm\5Cbuild\5CDebug\5Cbin")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 5.0.0 (http://llvm.org/git/clang.git 97b002238b11ff30d94d0516d6a0515db5725fd8) (http://llvm.org/git/llvm.git 0cb060ba567f1aa5b4b04e86665f88e4632b528a)"}
+!6 = distinct !DISubprogram(name: "f", linkageName: "flink", scope: !7, file: !7, line: 55, type: !8, isLocal: false, isDefinition: true, scopeLine: 55, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!7 = !DIFile(filename: "simple-repro.c", directory: "C:\5CGitHub\5Cllvm\5Cbuild\5CDebug\5Cbin")
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10, !11}
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !DILocalVariable(name: "x", arg: 1, scope: !6, file: !7, line: 55, type: !11)
+!13 = !DIExpression()
+!14 = !DILocation(line: 55, column: 13, scope: !6)
+!15 = !DILocalVariable(name: "coro_hdl", scope: !6, file: !7, line: 56, type: !10)
+!16 = !DILocation(line: 56, column: 3, scope: !6)
+!17 = !DILocation(line: 58, column: 5, scope: !6)
+!18 = !DILocation(line: 58, column: 5, scope: !19)
+!19 = distinct !DILexicalBlock(scope: !6, file: !7, line: 58, column: 5)
+!20 = !DILocation(line: 59, column: 9, scope: !6)
+!21 = !DILocation(line: 59, column: 10, scope: !6)
+!22 = !DILocation(line: 59, column: 7, scope: !6)
+!23 = !DILocation(line: 59, column: 5, scope: !6)
+!24 = !DILocation(line: 62, column: 3, scope: !6)
+
+; CHECK: define i8* @f(i32 %x) #0 !dbg ![[ORIG:[0-9]+]]
+; CHECK: define internal fastcc void @f.resume(%f.Frame* %FramePtr) #0 !dbg ![[RESUME:[0-9]+]]
+; CHECK: define internal fastcc void @f.destroy(%f.Frame* %FramePtr) #0 !dbg ![[DESTROY:[0-9]+]]
+; CHECK: define internal fastcc void @f.cleanup(%f.Frame* %FramePtr) #0 !dbg ![[CLEANUP:[0-9]+]]
+
+; CHECK: ![[ORIG]] = distinct !DISubprogram(name: "f", linkageName: "flink"
+; CHECK: !DILocalVariable(name: "x", arg: 1, scope: ![[ORIG]]
+
+; CHECK: ![[RESUME]] = distinct !DISubprogram(name: "f", linkageName: "flink"
+; CHECK: !DILocalVariable(name: "x", arg: 1, scope: ![[RESUME]]
+
+; CHECK: ![[DESTROY]] = distinct !DISubprogram(name: "f", linkageName: "flink"
+
+; CHECK: ![[CLEANUP]] = distinct !DISubprogram(name: "f", linkageName: "flink"
diff --git a/test/Transforms/Coroutines/coro-frame.ll b/test/Transforms/Coroutines/coro-frame.ll
index 001012fcd0c9..826d3a04fa1e 100644
--- a/test/Transforms/Coroutines/coro-frame.ll
+++ b/test/Transforms/Coroutines/coro-frame.ll
@@ -1,8 +1,11 @@
 ; Check that we can handle spills of the result of the invoke instruction
 ; RUN: opt < %s -coro-split -S | FileCheck %s
 
-define i8* @f() "coroutine.presplit"="1" personality i32 0 {
+define i8* @f(i64 %this) "coroutine.presplit"="1" personality i32 0 {
 entry:
+  %this.addr = alloca i64
+  store i64 %this, i64* %this.addr
+  %this1 = load i64, i64* %this.addr
   %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
   %size = call i32 @llvm.coro.size.i32()
   %alloc = call i8* @malloc(i32 %size)
@@ -15,6 +18,7 @@ cont:
                                 i8 1, label %cleanup]
 resume:
   call double @print(double %r)
+  call void @print2(i64 %this1)
   br label %cleanup
 
 cleanup:
@@ -30,12 +34,12 @@ pad:
 }
 
 ; See if the float was added to the frame
-; CHECK-LABEL: %f.Frame = type { void (%f.Frame*)*, void (%f.Frame*)*, i1, i1, double }
+; CHECK-LABEL: %f.Frame = type { void (%f.Frame*)*, void (%f.Frame*)*, i1, i1, i64, double }
 
 ; See if the float was spilled into the frame
 ; CHECK-LABEL: @f(
 ; CHECK: %r = call double @print(
-; CHECK: %r.spill.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 4
+; CHECK: %r.spill.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 5
 ; CHECK: store double %r, double* %r.spill.addr
 ; CHECK: ret i8* %hdl
 
@@ -58,4 +62,5 @@ declare i1 @llvm.coro.end(i8*, i1)
 
 declare noalias i8* @malloc(i32)
 declare double @print(double)
+declare void @print2(i64)
 declare void @free(i8*)
diff --git a/test/Transforms/Coroutines/coro-materialize.ll b/test/Transforms/Coroutines/coro-materialize.ll
new file mode 100644
index 000000000000..95e8a049ad2f
--- /dev/null
+++ b/test/Transforms/Coroutines/coro-materialize.ll
@@ -0,0 +1,52 @@
+; Verifies that we materialize instruction across suspend points
+; RUN: opt < %s -coro-split -S | FileCheck %s
+
+define i8* @f(i32 %n) "coroutine.presplit"="1" {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call i8* @malloc(i32 %size)
+  %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc)
+
+  %inc1 = add i32 %n, 1
+  %sp1 = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %sp1, label %suspend [i8 0, label %resume1
+                                  i8 1, label %cleanup]
+resume1:
+  %inc2 = add i32 %inc1, 1
+  %sp2 = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %sp1, label %suspend [i8 0, label %resume2
+                                  i8 1, label %cleanup]
+
+resume2:
+  call void @print(i32 %inc1)
+  call void @print(i32 %inc2)
+  br label %cleanup
+
+cleanup:
+  %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+  call void @free(i8* %mem)
+  br label %suspend
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
+  ret i8* %hdl
+}
+
+; See that we only spilled one value
+; CHECK: %f.Frame = type { void (%f.Frame*)*, void (%f.Frame*)*, i1, i1, i32 }
+; CHECK-LABEL: @f(
+
+declare i8* @llvm.coro.free(token, i8*)
+declare i32 @llvm.coro.size.i32()
+declare i8  @llvm.coro.suspend(token, i1)
+declare void @llvm.coro.resume(i8*)
+declare void @llvm.coro.destroy(i8*)
+
+declare token @llvm.coro.id(i32, i8*, i8*, i8*)
+declare i1 @llvm.coro.alloc(token)
+declare i8* @llvm.coro.begin(token, i8*)
+declare i1 @llvm.coro.end(i8*, i1)
+
+declare noalias i8* @malloc(i32)
+declare void @print(i32)
+declare void @free(i8*)
diff --git a/test/Transforms/EarlyCSE/const-speculation.ll b/test/Transforms/EarlyCSE/const-speculation.ll
new file mode 100644
index 000000000000..5b7f2f5b6982
--- /dev/null
+++ b/test/Transforms/EarlyCSE/const-speculation.ll
@@ -0,0 +1,39 @@
+; RUN: opt -early-cse -S %s | FileCheck %s
+
+%mystruct = type { i32 }
+
+; @var is global so that *every* GEP argument is Constant.
+@var = external global %mystruct
+
+; Control flow is to make the dominance tree consider the final icmp before it
+; gets to simplify the purely constant one (%tst). Since that icmp uses the
+; select that gets considered next. Finally the select simplification looks at
+; the %tst icmp and we don't want it to speculate about what happens if "i32 0"
+; is actually "i32 1", broken universes are automatic UB.
+;
+; In this case doing the speculation would create an invalid GEP(@var, 0, 1) and
+; crash.
+
+define i1 @test_constant_speculation() {
+; CHECK-LABEL: define i1 @test_constant_speculation
+entry:
+  br i1 undef, label %end, label %select
+
+select:
+; CHECK: select:
+; CHECK-NOT: icmp
+; CHECK-NOT: getelementptr
+; CHECK-NOT: select
+
+  %tst = icmp eq i32 1, 0
+  %elt = getelementptr %mystruct, %mystruct* @var, i64 0, i32 0
+  %sel = select i1 %tst, i32* null, i32* %elt
+  br label %end
+
+end:
+; CHECK: end:
+; CHECK: %tmp = phi i32* [ null, %entry ], [ getelementptr inbounds (%mystruct, %mystruct* @var, i64 0, i32 0), %select ]
+  %tmp = phi i32* [null, %entry], [%sel, %select]
+  %res = icmp eq i32* %tmp, null
+  ret i1 %res
+}
diff --git a/test/Transforms/GVN/PRE/phi-translate-2.ll b/test/Transforms/GVN/PRE/phi-translate-2.ll
new file mode 100644
index 000000000000..b2993657c7f5
--- /dev/null
+++ b/test/Transforms/GVN/PRE/phi-translate-2.ll
@@ -0,0 +1,105 @@
+; RUN: opt < %s -gvn -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@a = common global [100 x i64] zeroinitializer, align 16
+@b = common global [100 x i64] zeroinitializer, align 16
+@g1 = common global i64 0, align 8
+@g2 = common global i64 0, align 8
+@g3 = common global i64 0, align 8
+declare i64 @goo(...) local_unnamed_addr #1
+
+define void @test1(i64 %a, i64 %b, i64 %c, i64 %d) {
+entry:
+  %mul = mul nsw i64 %b, %a
+  store i64 %mul, i64* @g1, align 8
+  %t0 = load i64, i64* @g2, align 8
+  %cmp = icmp sgt i64 %t0, 3
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %mul2 = mul nsw i64 %d, %c
+  store i64 %mul2, i64* @g2, align 8
+  br label %if.end
+
+; Check phi-translate works and mul is removed.
+; CHECK-LABEL: @test1(
+; CHECK: if.end:
+; CHECK: %[[MULPHI:.*]] = phi i64 [ {{.*}}, %if.then ], [ %mul, %entry ]
+; CHECK-NOT: = mul
+; CHECK: store i64 %[[MULPHI]], i64* @g3, align 8
+if.end:                                           ; preds = %if.then, %entry
+  %b.addr.0 = phi i64 [ %d, %if.then ], [ %b, %entry ]
+  %a.addr.0 = phi i64 [ %c, %if.then ], [ %a, %entry ]
+  %mul3 = mul nsw i64 %a.addr.0, %b.addr.0
+  store i64 %mul3, i64* @g3, align 8
+  ret void
+}
+
+define void @test2(i64 %i) {
+entry:
+  %arrayidx = getelementptr inbounds [100 x i64], [100 x i64]* @a, i64 0, i64 %i
+  %t0 = load i64, i64* %arrayidx, align 8
+  %arrayidx1 = getelementptr inbounds [100 x i64], [100 x i64]* @b, i64 0, i64 %i
+  %t1 = load i64, i64* %arrayidx1, align 8
+  %mul = mul nsw i64 %t1, %t0
+  store i64 %mul, i64* @g1, align 8
+  %cmp = icmp sgt i64 %mul, 3
+  br i1 %cmp, label %if.then, label %if.end
+
+; Check phi-translate works for the phi generated by loadpre. A new mul will be
+; inserted in if.then block.
+; CHECK-LABEL: @test2(
+; CHECK: if.then:
+; CHECK: %[[MUL_THEN:.*]] = mul
+; CHECK: br label %if.end
+if.then:                                          ; preds = %entry
+  %call = tail call i64 (...) @goo() #2
+  store i64 %call, i64* @g2, align 8
+  br label %if.end
+
+; CHECK: if.end:
+; CHECK: %[[MULPHI:.*]] = phi i64 [ %[[MUL_THEN]], %if.then ], [ %mul, %entry ]
+; CHECK-NOT: = mul
+; CHECK: store i64 %[[MULPHI]], i64* @g3, align 8
+if.end:                                           ; preds = %if.then, %entry
+  %i.addr.0 = phi i64 [ 3, %if.then ], [ %i, %entry ]
+  %arrayidx3 = getelementptr inbounds [100 x i64], [100 x i64]* @a, i64 0, i64 %i.addr.0
+  %t2 = load i64, i64* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds [100 x i64], [100 x i64]* @b, i64 0, i64 %i.addr.0
+  %t3 = load i64, i64* %arrayidx4, align 8
+  %mul5 = mul nsw i64 %t3, %t2
+  store i64 %mul5, i64* @g3, align 8
+  ret void
+}
+
+; Check phi-translate doesn't go through backedge, which may lead to incorrect
+; pre transformation.
+; CHECK: for.end:
+; CHECK-NOT: %{{.*pre-phi}} = phi
+; CHECK: ret void
+define void @test3(i64 %N, i64* nocapture readonly %a) {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %add = add nuw nsw i64 %i.0, 1
+  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %add
+  %tmp0 = load i64, i64* %arrayidx, align 8
+  %cmp = icmp slt i64 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %call = tail call i64 (...) @goo() #2
+  %add1 = sub nsw i64 0, %call
+  %tobool = icmp eq i64 %tmp0, %add1
+  br i1 %tobool, label %for.cond, label %for.end
+
+for.end:                                          ; preds = %for.body, %for.cond
+  %i.0.lcssa = phi i64 [ %i.0, %for.body ], [ %i.0, %for.cond ]
+  %arrayidx2 = getelementptr inbounds i64, i64* %a, i64 %i.0.lcssa
+  %tmp1 = load i64, i64* %arrayidx2, align 8
+  store i64 %tmp1, i64* @g1, align 8
+  ret void
+}
+
diff --git a/test/Transforms/GVN/PRE/pre-gep-load.ll b/test/Transforms/GVN/PRE/pre-gep-load.ll
index 9eec8bb6455b..1b2b4d20d31d 100644
--- a/test/Transforms/GVN/PRE/pre-gep-load.ll
+++ b/test/Transforms/GVN/PRE/pre-gep-load.ll
@@ -37,7 +37,7 @@ sw.bb2:                                           ; preds = %if.end, %entry
   %3 = load double, double* %arrayidx5, align 8
 ; CHECK: sw.bb2:
 ; CHECK-NOT: sext
-; CHECK-NEXT: phi double [
+; CHECK: phi double [
 ; CHECK-NOT: load
   %sub6 = fsub double 3.000000e+00, %3
   br label %return
diff --git a/test/Transforms/GVN/PRE/pre-load.ll b/test/Transforms/GVN/PRE/pre-load.ll
index 685df24f62b6..ffff2b7f08e5 100644
--- a/test/Transforms/GVN/PRE/pre-load.ll
+++ b/test/Transforms/GVN/PRE/pre-load.ll
@@ -72,7 +72,7 @@ block4:
   %PRE = load i32, i32* %P3
   ret i32 %PRE
 ; CHECK: block4:
-; CHECK-NEXT: phi i32 [
+; CHECK: phi i32 [
 ; CHECK-NOT: load
 ; CHECK: ret i32
 }
@@ -104,7 +104,7 @@ block4:
   %PRE = load i32, i32* %P3
   ret i32 %PRE
 ; CHECK: block4:
-; CHECK-NEXT: phi i32 [
+; CHECK: phi i32 [
 ; CHECK-NOT: load
 ; CHECK: ret i32
 }
@@ -263,7 +263,7 @@ block4:
   %PRE = load i32, i32* %P3
   ret i32 %PRE
 ; CHECK: block4:
-; CHECK-NEXT: phi i32 [
+; CHECK: phi i32 [
 ; CHECK-NOT: load
 ; CHECK: ret i32
 }
diff --git a/test/Transforms/GVNSink/dither.ll b/test/Transforms/GVNSink/dither.ll
new file mode 100644
index 000000000000..9717021aca82
--- /dev/null
+++ b/test/Transforms/GVNSink/dither.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -S -gvn-sink | FileCheck %s
+
+; Because %tmp17 has flipped operands to its equivalents %tmp14 and %tmp7, we
+; can't sink the zext as we'd need a shuffling PHI in between.
+;
+; Just sinking the zext isn't profitable, so ensure nothing is sunk.
+
+; CHECK-LABEL: @hoge
+; CHECK-NOT: bb18.gvnsink.split
+define void @hoge() {
+bb:
+  br i1 undef, label %bb4, label %bb11
+
+bb4:                                              ; preds = %bb3
+  br i1 undef, label %bb6, label %bb8
+
+bb6:                                              ; preds = %bb5
+  %tmp = zext i16 undef to i64
+  %tmp7 = add i64 %tmp, undef
+  br label %bb18
+
+bb8:                                              ; preds = %bb5
+  %tmp9 = zext i16 undef to i64
+  br label %bb18
+
+bb11:                                             ; preds = %bb10
+  br i1 undef, label %bb12, label %bb15
+
+bb12:                                             ; preds = %bb11
+  %tmp13 = zext i16 undef to i64
+  %tmp14 = add i64 %tmp13, undef
+  br label %bb18
+
+bb15:                                             ; preds = %bb11
+  %tmp16 = zext i16 undef to i64
+  %tmp17 = add i64 undef, %tmp16
+  br label %bb18
+
+bb18:                                             ; preds = %bb15, %bb12, %bb8, %bb6
+  %tmp19 = phi i64 [ %tmp7, %bb6 ], [ undef, %bb8 ], [ %tmp14, %bb12 ], [ %tmp17, %bb15 ]
+  unreachable
+}
diff --git a/test/Transforms/GVNSink/indirect-call.ll b/test/Transforms/GVNSink/indirect-call.ll
new file mode 100644
index 000000000000..da98ed0819a6
--- /dev/null
+++ b/test/Transforms/GVNSink/indirect-call.ll
@@ -0,0 +1,70 @@
+; RUN: opt < %s -gvn-sink -simplifycfg -simplifycfg-sink-common=false -S | FileCheck %s
+
+declare i8 @ext(i1)
+
+define zeroext i1 @test1(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks, i8(i1)* %ext) {
+entry:
+  %cmp = icmp uge i32 %blksA, %nblks
+  br i1 %flag, label %if.then, label %if.else
+
+; CHECK-LABEL: test1
+; CHECK: call i8 @ext
+; CHECK: call i8 %ext
+if.then:
+  %frombool1 = call i8 @ext(i1 %cmp)
+  br label %if.end
+
+if.else:
+  %frombool3 = call i8 %ext(i1 %cmp)
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
+
+define zeroext i1 @test2(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks, i8(i1)* %ext) {
+entry:
+  %cmp = icmp uge i32 %blksA, %nblks
+  br i1 %flag, label %if.then, label %if.else
+
+; CHECK-LABEL: test2
+; CHECK: call i8 %ext
+; CHECK-NOT: call
+if.then:
+  %frombool1 = call i8 %ext(i1 %cmp)
+  br label %if.end
+
+if.else:
+  %frombool3 = call i8 %ext(i1 %cmp)
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
+
+define zeroext i1 @test3(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks, i8(i1)* %ext1, i8(i1)* %ext2) {
+entry:
+  %cmp = icmp uge i32 %blksA, %nblks
+  br i1 %flag, label %if.then, label %if.else
+
+; CHECK-LABEL: test3
+; CHECK: %[[x:.*]] = select i1 %flag, i8 (i1)* %ext1, i8 (i1)* %ext2
+; CHECK: call i8 %[[x]](i1 %cmp)
+; CHECK-NOT: call
+if.then:
+  %frombool1 = call i8 %ext1(i1 %cmp)
+  br label %if.end
+
+if.else:
+  %frombool3 = call i8 %ext2(i1 %cmp)
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
diff --git a/test/Transforms/GVNSink/sink-common-code.ll b/test/Transforms/GVNSink/sink-common-code.ll
new file mode 100644
index 000000000000..d9e757cd10fc
--- /dev/null
+++ b/test/Transforms/GVNSink/sink-common-code.ll
@@ -0,0 +1,694 @@
+; RUN: opt < %s -gvn-sink -simplifycfg -simplifycfg-sink-common=false -S | FileCheck %s
+
+define zeroext i1 @test1(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+; CHECK-LABEL: test1
+; CHECK: add
+; CHECK: select
+; CHECK: icmp
+; CHECK-NOT: br
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  br label %if.end
+
+if.else:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp ule i32 %add, %blksA
+  %frombool3 = zext i1 %cmp2 to i8
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
+
+define zeroext i1 @test2(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+; CHECK-LABEL: test2
+; CHECK: add
+; CHECK: select
+; CHECK: icmp
+; CHECK-NOT: br
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  br label %if.end
+
+if.else:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp uge i32 %blksA, %add
+  %frombool3 = zext i1 %cmp2 to i8
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
+
+declare i32 @foo(i32, i32) nounwind readnone
+
+define i32 @test3(i1 zeroext %flag, i32 %x, i32 %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %x0 = call i32 @foo(i32 %x, i32 0) nounwind readnone
+  %y0 = call i32 @foo(i32 %x, i32 1) nounwind readnone
+  br label %if.end
+
+if.else:
+  %x1 = call i32 @foo(i32 %y, i32 0) nounwind readnone
+  %y1 = call i32 @foo(i32 %y, i32 1) nounwind readnone
+  br label %if.end
+
+if.end:
+  %xx = phi i32 [ %x0, %if.then ], [ %x1, %if.else ]
+  %yy = phi i32 [ %y0, %if.then ], [ %y1, %if.else ]
+  %ret = add i32 %xx, %yy
+  ret i32 %ret
+}
+
+; CHECK-LABEL: test3
+; CHECK: select
+; CHECK: call
+; CHECK: call
+; CHECK: add
+; CHECK-NOT: br
+
+define i32 @test4(i1 zeroext %flag, i32 %x, i32* %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %a = add i32 %x, 5
+  store i32 %a, i32* %y
+  br label %if.end
+
+if.else:
+  %b = add i32 %x, 7
+  store i32 %b, i32* %y
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+; CHECK-LABEL: test4
+; CHECK: select
+; CHECK: store
+; CHECK-NOT: store
+
+define i32 @test5(i1 zeroext %flag, i32 %x, i32* %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %a = add i32 %x, 5
+  store volatile i32 %a, i32* %y
+  br label %if.end
+
+if.else:
+  %b = add i32 %x, 7
+  store i32 %b, i32* %y
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+; CHECK-LABEL: test5
+; CHECK: store volatile
+; CHECK: store
+
+define i32 @test6(i1 zeroext %flag, i32 %x, i32* %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %a = add i32 %x, 5
+  store volatile i32 %a, i32* %y
+  br label %if.end
+
+if.else:
+  %b = add i32 %x, 7
+  store volatile i32 %b, i32* %y
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+; CHECK-LABEL: test6
+; CHECK: select
+; CHECK: store volatile
+; CHECK-NOT: store
+
+define i32 @test7(i1 zeroext %flag, i32 %x, i32* %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %z = load volatile i32, i32* %y
+  %a = add i32 %z, 5
+  store volatile i32 %a, i32* %y
+  br label %if.end
+
+if.else:
+  %w = load volatile i32, i32* %y
+  %b = add i32 %w, 7
+  store volatile i32 %b, i32* %y
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+; CHECK-LABEL: test7
+; CHECK-DAG: select
+; CHECK-DAG: load volatile
+; CHECK: store volatile
+; CHECK-NOT: load
+; CHECK-NOT: store
+
+; The extra store in %if.then means %z and %w are not equivalent.
+define i32 @test9(i1 zeroext %flag, i32 %x, i32* %y, i32* %p) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  store i32 7, i32* %p
+  %z = load volatile i32, i32* %y
+  store i32 6, i32* %p
+  %a = add i32 %z, 5
+  store volatile i32 %a, i32* %y
+  br label %if.end
+
+if.else:
+  %w = load volatile i32, i32* %y
+  %b = add i32 %w, 7
+  store volatile i32 %b, i32* %y
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+; CHECK-LABEL: test9
+; CHECK: add
+; CHECK: add
+
+%struct.anon = type { i32, i32 }
+
+; The GEP indexes a struct type so cannot have a variable last index.
+define i32 @test10(i1 zeroext %flag, i32 %x, i32* %y, %struct.anon* %s) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %x, 5
+  %gepa = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 0
+  store volatile i32 %x, i32* %gepa
+  br label %if.end
+
+if.else:
+  %dummy1 = add i32 %x, 6
+  %gepb = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  store volatile i32 %x, i32* %gepb
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+; CHECK-LABEL: test10
+; CHECK: getelementptr
+; CHECK: store volatile
+; CHECK: getelementptr
+; CHECK: store volatile
+
+; The shufflevector's mask operand cannot be merged in a PHI.
+define i32 @test11(i1 zeroext %flag, i32 %w, <2 x i32> %x, <2 x i32> %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %w, 5
+  %sv1 = shufflevector <2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 0, i32 1>
+  br label %if.end
+
+if.else:
+  %dummy1 = add i32 %w, 6
+  %sv2 = shufflevector <2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 1, i32 0>
+  br label %if.end
+
+if.end:
+  %p = phi <2 x i32> [ %sv1, %if.then ], [ %sv2, %if.else ]
+  ret i32 1
+}
+
+; CHECK-LABEL: test11
+; CHECK: shufflevector
+; CHECK: shufflevector
+
+; We can't common an intrinsic!
+define i32 @test12(i1 zeroext %flag, i32 %w, i32 %x, i32 %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %w, 5
+  %sv1 = call i32 @llvm.ctlz.i32(i32 %x)
+  br label %if.end
+
+if.else:
+  %dummy1 = add i32 %w, 6
+  %sv2 = call i32 @llvm.cttz.i32(i32 %x)
+  br label %if.end
+
+if.end:
+  %p = phi i32 [ %sv1, %if.then ], [ %sv2, %if.else ]
+  ret i32 1
+}
+
+declare i32 @llvm.ctlz.i32(i32 %x) readnone
+declare i32 @llvm.cttz.i32(i32 %x) readnone
+
+; CHECK-LABEL: test12
+; CHECK: call i32 @llvm.ctlz
+; CHECK: call i32 @llvm.cttz
+
+; The TBAA metadata should be properly combined.
+define i32 @test13(i1 zeroext %flag, i32 %x, i32* %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %z = load volatile i32, i32* %y
+  %a = add i32 %z, 5
+  store volatile i32 %a, i32* %y, !tbaa !3
+  br label %if.end
+
+if.else:
+  %w = load volatile i32, i32* %y
+  %b = add i32 %w, 7
+  store volatile i32 %b, i32* %y, !tbaa !4
+  br label %if.end
+
+if.end:
+  ret i32 1
+}
+
+!0 = !{ !"an example type tree" }
+!1 = !{ !"int", !0 }
+!2 = !{ !"float", !0 }
+!3 = !{ !"const float", !2, i64 0 }
+!4 = !{ !"special float", !2, i64 1 }
+
+; CHECK-LABEL: test13
+; CHECK-DAG: select
+; CHECK-DAG: load volatile
+; CHECK: store volatile {{.*}}, !tbaa !0
+; CHECK-NOT: load
+; CHECK-NOT: store
+
+; The call should be commoned.
+define i32 @test13a(i1 zeroext %flag, i32 %w, i32 %x, i32 %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %sv1 = call i32 @bar(i32 %x)
+  br label %if.end
+
+if.else:
+  %sv2 = call i32 @bar(i32 %y)
+  br label %if.end
+
+if.end:
+  %p = phi i32 [ %sv1, %if.then ], [ %sv2, %if.else ]
+  ret i32 1
+}
+declare i32 @bar(i32)
+
+; CHECK-LABEL: test13a
+; CHECK: %[[x:.*]] = select i1 %flag
+; CHECK: call i32 @bar(i32 %[[x]])
+
+; The load should be commoned.
+define i32 @test14(i1 zeroext %flag, i32 %w, i32 %x, i32 %y, %struct.anon* %s) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %x, 1
+  %gepa = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  %sv1 = load i32, i32* %gepa
+  %cmp1 = icmp eq i32 %sv1, 56
+  br label %if.end
+
+if.else:
+  %dummy2 = add i32 %x, 4
+  %gepb = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  %sv2 = load i32, i32* %gepb
+  %cmp2 = icmp eq i32 %sv2, 57
+  br label %if.end
+
+if.end:
+  %p = phi i1 [ %cmp1, %if.then ], [ %cmp2, %if.else ]
+  ret i32 1
+}
+
+; CHECK-LABEL: test14
+; CHECK: getelementptr
+; CHECK: load
+; CHECK-NOT: load
+
+; The load should be commoned.
+define i32 @test15(i1 zeroext %flag, i32 %w, i32 %x, i32 %y, %struct.anon* %s) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %dummy = add i32 %x, 1
+  %gepa = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 0
+  %sv1 = load i32, i32* %gepa
+  %ext1 = zext i32 %sv1 to i64
+  %cmp1 = icmp eq i64 %ext1, 56
+  br label %if.end
+
+if.else:
+  %dummy2 = add i32 %x, 4
+  %gepb = getelementptr inbounds %struct.anon, %struct.anon* %s, i32 0, i32 1
+  %sv2 = load i32, i32* %gepb
+  %ext2 = zext i32 %sv2 to i64
+  %cmp2 = icmp eq i64 %ext2, 56
+  br label %if.end
+
+if.end:
+  %p = phi i1 [ %cmp1, %if.then ], [ %cmp2, %if.else ]
+  ret i32 1
+}
+
+; CHECK-LABEL: test15
+; CHECK: getelementptr
+; CHECK: load
+; CHECK-NOT: load
+
+define zeroext i1 @test_crash(i1 zeroext %flag, i32* %i4, i32* %m, i32* %n) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %tmp1 = load i32, i32* %i4
+  %tmp2 = add i32 %tmp1, -1
+  store i32 %tmp2, i32* %i4
+  br label %if.end
+
+if.else:
+  %tmp3 = load i32, i32* %m
+  %tmp4 = load i32, i32* %n
+  %tmp5 = add i32 %tmp3, %tmp4
+  store i32 %tmp5, i32* %i4
+  br label %if.end
+
+if.end:
+  ret i1 true
+}
+
+; CHECK-LABEL: test_crash
+; No checks for test_crash - just ensure it doesn't crash!
+
+define zeroext i1 @test16(i1 zeroext %flag, i1 zeroext %flag2, i32 %blksA, i32 %blksB, i32 %nblks) {
+
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  br label %if.end
+
+if.else:
+  br i1 %flag2, label %if.then2, label %if.end
+
+if.then2:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp ule i32 %add, %blksA
+  %frombool3 = zext i1 %cmp2 to i8
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.then2 ], [ 0, %if.else ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
+
+; CHECK-LABEL: test16
+; CHECK: zext
+; CHECK: zext
+
+define zeroext i1 @test16a(i1 zeroext %flag, i1 zeroext %flag2, i32 %blksA, i32 %blksB, i32 %nblks, i8* %p) {
+
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  %b1 = sext i8 %frombool1 to i32
+  %b2 = trunc i32 %b1 to i8
+  store i8 %b2, i8* %p
+  br label %if.end
+
+if.else:
+  br i1 %flag2, label %if.then2, label %if.end
+
+if.then2:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp ule i32 %add, %blksA
+  %frombool3 = zext i1 %cmp2 to i8
+  %a1 = sext i8 %frombool3 to i32
+  %a2 = trunc i32 %a1 to i8
+  store i8 %a2, i8* %p
+  br label %if.end
+
+if.end:
+  ret i1 true
+}
+
+; CHECK-LABEL: test16a
+; CHECK: zext
+; CHECK-NOT: zext
+
+define zeroext i1 @test17(i32 %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+  switch i32 %flag, label %if.end [
+    i32 0, label %if.then
+    i32 1, label %if.then2
+  ]
+
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = call i8 @i1toi8(i1 %cmp)
+  %a1 = sext i8 %frombool1 to i32
+  %a2 = trunc i32 %a1 to i8
+  br label %if.end
+
+if.then2:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp ule i32 %add, %blksA
+  %frombool3 = call i8 @i1toi8(i1 %cmp2)
+  %b1 = sext i8 %frombool3 to i32
+  %b2 = trunc i32 %b1 to i8
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %a2, %if.then ], [ %b2, %if.then2 ], [ 0, %entry ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
+declare i8 @i1toi8(i1)
+
+; FIXME: DISABLED - we don't consider this profitable. We should
+;  - Consider argument setup/return mov'ing for calls, like InlineCost does.
+;  - Consider the removal of the %obeys.0 PHI (zero PHI movement overall)
+
+; DISABLED-CHECK-LABEL: test17
+; DISABLED-CHECK: if.then:
+; DISABLED-CHECK-NEXT: icmp uge
+; DISABLED-CHECK-NEXT: br label %[[x:.*]]
+
+; DISABLED-CHECK: if.then2:
+; DISABLED-CHECK-NEXT: add
+; DISABLED-CHECK-NEXT: icmp ule
+; DISABLED-CHECK-NEXT: br label %[[x]]
+
+; DISABLED-CHECK: [[x]]:
+; DISABLED-CHECK-NEXT: %[[y:.*]] = phi i1 [ %cmp
+; DISABLED-CHECK-NEXT: %[[z:.*]] = call i8 @i1toi8(i1 %[[y]])
+; DISABLED-CHECK-NEXT: br label %if.end
+
+; DISABLED-CHECK: if.end:
+; DISABLED-CHECK-NEXT: phi i8
+; DISABLED-CHECK-DAG: [ %[[z]], %[[x]] ]
+; DISABLED-CHECK-DAG: [ 0, %entry ]
+
+define zeroext i1 @test18(i32 %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+  switch i32 %flag, label %if.then3 [
+    i32 0, label %if.then
+    i32 1, label %if.then2
+  ]
+
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  br label %if.end
+
+if.then2:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp ule i32 %add, %blksA
+  %frombool3 = zext i1 %cmp2 to i8
+  br label %if.end
+
+if.then3:
+  %add2 = add i32 %nblks, %blksA
+  %cmp3 = icmp ule i32 %add2, %blksA
+  %frombool4 = zext i1 %cmp3 to i8
+  br label %if.end
+
+if.end:
+  %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.then2 ], [ %frombool4, %if.then3 ]
+  %tobool4 = icmp ne i8 %obeys.0, 0
+  ret i1 %tobool4
+}
+
+; CHECK-LABEL: test18
+; CHECK: if.end:
+; CHECK-NEXT: %[[x:.*]] = phi i1
+; CHECK-DAG: [ %cmp, %if.then ]
+; CHECK-DAG: [ %cmp2, %if.then2 ]
+; CHECK-DAG: [ %cmp3, %if.then3 ]
+; CHECK-NEXT: zext i1 %[[x]] to i8
+
+; The phi is confusing - both add instructions are used by it, but
+; not on their respective unconditional arcs. It should not be
+; optimized.
+define void @test_pr30292(i1 %cond, i1 %cond2, i32 %a, i32 %b) {
+entry:
+  %add1 = add i32 %a, 1
+  br label %succ
+
+one:
+  br i1 %cond, label %two, label %succ
+
+two:
+  call void @g()
+  %add2 = add i32 %a, 1
+  br label %succ
+
+succ:
+  %p = phi i32 [ 0, %entry ], [ %add1, %one ], [ %add2, %two ]
+  br label %one
+}
+declare void @g()
+
+; CHECK-LABEL: test_pr30292
+; CHECK: phi i32 [ 0, %entry ], [ %add1, %succ ], [ %add2, %two ]
+
+define zeroext i1 @test_pr30244(i1 zeroext %flag, i1 zeroext %flag2, i32 %blksA, i32 %blksB, i32 %nblks) {
+
+entry:
+  %p = alloca i8
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %cmp = icmp uge i32 %blksA, %nblks
+  %frombool1 = zext i1 %cmp to i8
+  store i8 %frombool1, i8* %p
+  br label %if.end
+
+if.else:
+  br i1 %flag2, label %if.then2, label %if.end
+
+if.then2:
+  %add = add i32 %nblks, %blksB
+  %cmp2 = icmp ule i32 %add, %blksA
+  %frombool3 = zext i1 %cmp2 to i8
+  store i8 %frombool3, i8* %p
+  br label %if.end
+
+if.end:
+  ret i1 true
+}
+
+; CHECK-LABEL: @test_pr30244
+; CHECK: store
+; CHECK-NOT: store
+
+define i32 @test_pr30373a(i1 zeroext %flag, i32 %x, i32 %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %x0 = call i32 @foo(i32 %x, i32 0) nounwind readnone
+  %y0 = call i32 @foo(i32 %x, i32 1) nounwind readnone
+  %z0 = lshr i32 %y0, 8
+  br label %if.end
+
+if.else:
+  %x1 = call i32 @foo(i32 %y, i32 0) nounwind readnone
+  %y1 = call i32 @foo(i32 %y, i32 1) nounwind readnone
+  %z1 = lshr exact i32 %y1, 8
+  br label %if.end
+
+if.end:
+  %xx = phi i32 [ %x0, %if.then ], [ %x1, %if.else ]
+  %yy = phi i32 [ %z0, %if.then ], [ %z1, %if.else ]
+  %ret = add i32 %xx, %yy
+  ret i32 %ret
+}
+
+; CHECK-LABEL: test_pr30373a
+; CHECK: lshr
+; CHECK-NOT: exact
+; CHECK: }
+
+define i32 @test_pr30373b(i1 zeroext %flag, i32 %x, i32 %y) {
+entry:
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+  %x0 = call i32 @foo(i32 %x, i32 0) nounwind readnone
+  %y0 = call i32 @foo(i32 %x, i32 1) nounwind readnone
+  %z0 = lshr exact i32 %y0, 8
+  br label %if.end
+
+if.else:
+  %x1 = call i32 @foo(i32 %y, i32 0) nounwind readnone
+  %y1 = call i32 @foo(i32 %y, i32 1) nounwind readnone
+  %z1 = lshr i32 %y1, 8
+  br label %if.end
+
+if.end:
+  %xx = phi i32 [ %x0, %if.then ], [ %x1, %if.else ]
+  %yy = phi i32 [ %z0, %if.then ], [ %z1, %if.else ]
+  %ret = add i32 %xx, %yy
+  ret i32 %ret
+}
+
+; CHECK-LABEL: test_pr30373b
+; CHECK: lshr
+; CHECK-NOT: exact
+; CHECK: }
+
+; CHECK: !0 = !{!1, !1, i64 0}
+; CHECK: !1 = !{!"float", !2}
+; CHECK: !2 = !{!"an example type tree"}
diff --git a/test/Transforms/GVNSink/struct.ll b/test/Transforms/GVNSink/struct.ll
new file mode 100644
index 000000000000..2228cf2803ae
--- /dev/null
+++ b/test/Transforms/GVNSink/struct.ll
@@ -0,0 +1,71 @@
+; RUN: opt -gvn-sink -S < %s | FileCheck %s
+
+%struct = type {i32, i32}
+%struct2 = type { [ 2 x i32], i32 }
+
+; Struct indices cannot be variant.
+
+; CHECK-LABEL: @f() {
+; CHECK: getelementptr
+; CHECK: getelementptr
+define void @f() {
+bb:
+  br i1 undef, label %bb2, label %bb1
+
+bb1:                                              ; preds = %bb
+  %tmp = getelementptr inbounds %struct, %struct* null, i64 0, i32 1
+  br label %bb4
+
+bb2:                                              ; preds = %bb
+  %tmp3 = getelementptr inbounds %struct, %struct* null, i64 0, i32 0
+  br label %bb4
+
+bb4:                                              ; preds = %bb2, %bb1
+  %tmp5 = phi i32 [ 1, %bb1 ], [ 0, %bb2 ]
+  ret void
+}
+
+; Struct indices cannot be variant.
+
+; CHECK-LABEL: @g() {
+; CHECK: getelementptr
+; CHECK: getelementptr
+define void @g() {
+bb:
+  br i1 undef, label %bb2, label %bb1
+
+bb1:                                              ; preds = %bb
+  %tmp = getelementptr inbounds %struct2, %struct2* null, i64 0, i32 0, i32 1
+  br label %bb4
+
+bb2:                                              ; preds = %bb
+  %tmp3 = getelementptr inbounds %struct2, %struct2* null, i64 0, i32 0, i32 0
+  br label %bb4
+
+bb4:                                              ; preds = %bb2, %bb1
+  %tmp5 = phi i32 [ 1, %bb1 ], [ 0, %bb2 ]
+  ret void
+}
+
+
+; ... but the first parameter to a GEP can.
+
+; CHECK-LABEL: @h() {
+; CHECK: getelementptr
+; CHECK-NOT: getelementptr
+define void @h() {
+bb:
+  br i1 undef, label %bb2, label %bb1
+
+bb1:                                              ; preds = %bb
+  %tmp = getelementptr inbounds %struct, %struct* null, i32 0, i32 0
+  br label %bb4
+
+bb2:                                              ; preds = %bb
+  %tmp3 = getelementptr inbounds %struct, %struct* null, i32 1, i32 0
+  br label %bb4
+
+bb4:                                              ; preds = %bb2, %bb1
+  %tmp5 = phi i32 [ 0, %bb1 ], [ 1, %bb2 ]
+  ret void
+}
+\ No newline at end of file
diff --git a/test/Transforms/GlobalDCE/externally_available.ll b/test/Transforms/GlobalDCE/externally_available.ll
index fca49b29ec8e..bc54db38cee0 100644
--- a/test/Transforms/GlobalDCE/externally_available.ll
+++ b/test/Transforms/GlobalDCE/externally_available.ll
@@ -1,12 +1,21 @@
 ; RUN: opt < %s -globaldce -S | FileCheck %s
 
+; test_global should not be emitted to the .s file.
+; CHECK-NOT: @test_global =
+@test_global = available_externally global i32 4
+
+; test_global2 is a normal global using an available externally function.
+; CHECK: @test_global2 =
+@test_global2 = global i32 ()* @test_function2
+
 ; test_function should not be emitted to the .s file.
-; CHECK-NOT: test_function
+; CHECK-NOT: define {{.*}} @test_function()
 define available_externally i32 @test_function() {
   ret i32 4
 }
 
-; test_global should not be emitted to the .s file.
-; CHECK-NOT: test_global
-@test_global = available_externally global i32 4
-
+; test_function2 isn't actually dead even though it's available externally.
+; CHECK: define available_externally i32 @test_function2()
+define available_externally i32 @test_function2() {
+  ret i32 4
+}
diff --git a/test/Transforms/Inline/prof-update-instr.ll b/test/Transforms/Inline/prof-update-instr.ll
new file mode 100644
index 000000000000..6650165cb904
--- /dev/null
+++ b/test/Transforms/Inline/prof-update-instr.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -S | FileCheck %s
+; Checks if inliner updates VP metadata for indrect call instructions
+; with instrumentation based profile.
+
+@func = global void ()* null
+@func2 = global void ()* null
+
+; CHECK: define void @callee(i32 %n) !prof ![[ENTRY_COUNT:[0-9]*]]
+define void  @callee(i32 %n) !prof !15 {
+  %cond = icmp sle i32 %n, 10
+  br i1 %cond, label %cond_true, label %cond_false, !prof !20
+cond_true:
+; f2 is optimized away, thus not updated.
+  %f2 = load void ()*, void ()** @func2
+; CHECK: call void %f2(), !prof ![[COUNT_IND_CALLEE1:[0-9]*]]
+  call void %f2(), !prof !19
+  ret void
+cond_false:
+  %f = load void ()*, void ()** @func
+; CHECK: call void %f(), !prof ![[COUNT_IND_CALLEE:[0-9]*]]
+  call void %f(), !prof !18
+  ret void
+}
+
+; CHECK: define void @caller()
+define void @caller() !prof !21 {
+; CHECK: call void %f.i(), !prof ![[COUNT_IND_CALLER:[0-9]*]]
+  call void @callee(i32 15)
+  ret void
+}
+
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 10}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 2000}
+!8 = !{!"NumCounts", i64 2}
+!9 = !{!"NumFunctions", i64 2}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 100, i32 1}
+!13 = !{i32 999000, i64 100, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}
+!15 = !{!"function_entry_count", i64 1000}
+!16 = !{!"branch_weights", i64 2000}
+!18 = !{!"VP", i32 0, i64 140, i64 111, i64 80, i64 222, i64 40, i64 333, i64 20}
+!19 = !{!"VP", i32 0, i64 200, i64 111, i64 100, i64 222, i64 60, i64 333, i64 40}
+!20 = !{!"branch_weights", i32 1000, i32 1000}
+!21 = !{!"function_entry_count", i64 400}
+attributes #0 = { alwaysinline }
+; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 600}
+; CHECK: ![[COUNT_IND_CALLEE1]] = !{!"VP", i32 0, i64 200, i64 111, i64 100, i64 222, i64 60, i64 333, i64 40}
+; CHECK: ![[COUNT_IND_CALLEE]] = !{!"VP", i32 0, i64 84, i64 111, i64 48, i64 222, i64 24, i64 333, i64 12}
+; CHECK: ![[COUNT_IND_CALLER]] = !{!"VP", i32 0, i64 56, i64 111, i64 32, i64 222, i64 16, i64 333, i64 8}
diff --git a/test/Transforms/Inline/prof-update.ll b/test/Transforms/Inline/prof-update-sample.ll
index 4a4471e8e17a..4a4471e8e17a 100644
--- a/test/Transforms/Inline/prof-update.ll
+++ b/test/Transforms/Inline/prof-update-sample.ll
diff --git a/test/Transforms/InstCombine/2008-07-10-ICmpBinOp.ll b/test/Transforms/InstCombine/2008-07-10-ICmpBinOp.ll
deleted file mode 100644
index 76e30399a666..000000000000
--- a/test/Transforms/InstCombine/2008-07-10-ICmpBinOp.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: opt < %s -instcombine -S | not grep add
-; RUN: opt < %s -instcombine -S | not grep mul
-; PR2330
-
-define i1 @f(i32 %x, i32 %y) nounwind {
-entry:
-  %A = add i32 %x, 5
-  %B = add i32 %y, 5
-  %C = icmp eq i32 %A, %B
-  ret i1 %C
-}
-
-define i1 @g(i32 %x, i32 %y) nounwind {
-entry:
-  %A = mul i32 %x, 5
-  %B = mul i32 %y, 5
-  %C = icmp eq i32 %A, %B
-  ret i1 %C
-}
diff --git a/test/Transforms/InstCombine/2008-08-17-ICmpXorSignbit.ll b/test/Transforms/InstCombine/2008-08-17-ICmpXorSignbit.ll
deleted file mode 100644
index b91457c79dea..000000000000
--- a/test/Transforms/InstCombine/2008-08-17-ICmpXorSignbit.ll
+++ /dev/null
@@ -1,87 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-define i1 @test1(i8 %x, i8 %y) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[TMP:%.*]] = icmp ult i8 %x, %y
-; CHECK-NEXT:    ret i1 [[TMP]]
-;
-  %X = xor i8 %x, 128
-  %Y = xor i8 %y, 128
-  %tmp = icmp slt i8 %X, %Y
-  ret i1 %tmp
-}
-
-define i1 @test2(i8 %x, i8 %y) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[TMP:%.*]] = icmp slt i8 %x, %y
-; CHECK-NEXT:    ret i1 [[TMP]]
-;
-  %X = xor i8 %x, 128
-  %Y = xor i8 %y, 128
-  %tmp = icmp ult i8 %X, %Y
-  ret i1 %tmp
-}
-
-define i1 @test3(i8 %x) {
-; CHECK-LABEL: @test3(
-; CHECK-NEXT:    [[TMP:%.*]] = icmp sgt i8 %x, -114
-; CHECK-NEXT:    ret i1 [[TMP]]
-;
-  %X = xor i8 %x, 128
-  %tmp = icmp uge i8 %X, 15
-  ret i1 %tmp
-}
-
-define <2 x i1> @test3vec(<2 x i8> %x) {
-; CHECK-LABEL: @test3vec(
-; CHECK-NEXT:    [[TMP:%.*]] = icmp sgt <2 x i8> %x, <i8 -114, i8 -114>
-; CHECK-NEXT:    ret <2 x i1> [[TMP]]
-;
-  %X = xor <2 x i8> %x, <i8 128, i8 128>
-  %tmp = icmp uge <2 x i8> %X, <i8 15, i8 15>
-  ret <2 x i1> %tmp
-}
-
-define i1 @test4(i8 %x, i8 %y) {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[TMP:%.*]] = icmp ugt i8 %x, %y
-; CHECK-NEXT:    ret i1 [[TMP]]
-;
-  %X = xor i8 %x, 127
-  %Y = xor i8 %y, 127
-  %tmp = icmp slt i8 %X, %Y
-  ret i1 %tmp
-}
-
-define i1 @test5(i8 %x, i8 %y) {
-; CHECK-LABEL: @test5(
-; CHECK-NEXT:    [[TMP:%.*]] = icmp sgt i8 %x, %y
-; CHECK-NEXT:    ret i1 [[TMP]]
-;
-  %X = xor i8 %x, 127
-  %Y = xor i8 %y, 127
-  %tmp = icmp ult i8 %X, %Y
-  ret i1 %tmp
-}
-
-define i1 @test6(i8 %x) {
-; CHECK-LABEL: @test6(
-; CHECK-NEXT:    [[TMP:%.*]] = icmp slt i8 %x, 113
-; CHECK-NEXT:    ret i1 [[TMP]]
-;
-  %X = xor i8 %x, 127
-  %tmp = icmp uge i8 %X, 15
-  ret i1 %tmp
-}
-
-define <2 x i1> @test6vec(<2 x i8> %x) {
-; CHECK-LABEL: @test6vec(
-; CHECK-NEXT:    [[TMP:%.*]] = icmp slt <2 x i8> %x, <i8 113, i8 113>
-; CHECK-NEXT:    ret <2 x i1> [[TMP]]
-;
-  %X = xor <2 x i8> %x, <i8 127, i8 127>
-  %tmp = icmp uge <2 x i8> %X, <i8 15, i8 15>
-  ret <2 x i1> %tmp
-}
-
diff --git a/test/Transforms/InstCombine/alloca.ll b/test/Transforms/InstCombine/alloca.ll
index f81f700e6cf4..490830af2d82 100644
--- a/test/Transforms/InstCombine/alloca.ll
+++ b/test/Transforms/InstCombine/alloca.ll
@@ -51,8 +51,8 @@ define i32* @test4(i32 %n) {
   ret i32* %A
 }
 
-; Allocas which are only used by GEPs, bitcasts, and stores (transitively)
-; should be deleted.
+; Allocas which are only used by GEPs, bitcasts, addrspacecasts, and stores
+; (transitively) should be deleted.
 define void @test5() {
 ; CHECK-LABEL: @test5(
 ; CHECK-NOT: alloca
@@ -62,6 +62,7 @@ define void @test5() {
 entry:
   %a = alloca { i32 }
   %b = alloca i32*
+  %c = alloca i32
   %a.1 = getelementptr { i32 }, { i32 }* %a, i32 0, i32 0
   store i32 123, i32* %a.1
   store i32* %a.1, i32** %b
@@ -73,6 +74,8 @@ entry:
   store atomic i32 3, i32* %a.3 release, align 4
   %a.4 = getelementptr { i32 }, { i32 }* %a, i32 0, i32 0
   store atomic i32 4, i32* %a.4 seq_cst, align 4
+  %c.1 = addrspacecast i32* %c to i32 addrspace(1)*
+  store i32 123, i32 addrspace(1)* %c.1
   ret void
 }
 
diff --git a/test/Transforms/InstCombine/bitcast-vec-canon.ll b/test/Transforms/InstCombine/bitcast-vec-canon.ll
index 97145221099e..a92a7b73fd7e 100644
--- a/test/Transforms/InstCombine/bitcast-vec-canon.ll
+++ b/test/Transforms/InstCombine/bitcast-vec-canon.ll
@@ -1,41 +1,40 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
 define double @a(<1 x i64> %y) {
+; CHECK-LABEL: @a(
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <1 x i64> %y to <1 x double>
+; CHECK-NEXT:    [[C:%.*]] = extractelement <1 x double> [[BC]], i32 0
+; CHECK-NEXT:    ret double [[C]]
+;
   %c = bitcast <1 x i64> %y to double
   ret double %c
- 
-; CHECK-LABEL: @a(
-; CHECK-NEXT:  bitcast <1 x i64> %y to <1 x double>
-; CHECK-NEXT:  extractelement <1 x double> {{.*}}, i32 0
-; CHECK-NEXT:  ret double
 }
 
 define i64 @b(<1 x i64> %y) {
+; CHECK-LABEL: @b(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i64> %y, i32 0
+; CHECK-NEXT:    ret i64 [[TMP1]]
+;
   %c = bitcast <1 x i64> %y to i64
   ret i64 %c
-
-; CHECK-LABEL: @b(
-; CHECK-NEXT:  extractelement <1 x i64> %y, i32 0
-; CHECK-NEXT:  ret i64
 }
 
 define <1 x i64> @c(double %y) {
+; CHECK-LABEL: @c(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double %y to i64
+; CHECK-NEXT:    [[C:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+; CHECK-NEXT:    ret <1 x i64> [[C]]
+;
   %c = bitcast double %y to <1 x i64>
   ret <1 x i64> %c
-
-; CHECK-LABEL: @c(
-; CHECK-NEXT:  bitcast double %y to i64
-; CHECK-NEXT:  insertelement <1 x i64> undef, i64 {{.*}}, i32 0
-; CHECK-NEXT:  ret <1 x i64>
 }
 
 define <1 x i64> @d(i64 %y) {
+; CHECK-LABEL: @d(
+; CHECK-NEXT:    [[C:%.*]] = insertelement <1 x i64> undef, i64 %y, i32 0
+; CHECK-NEXT:    ret <1 x i64> [[C]]
+;
   %c = bitcast i64 %y to <1 x i64>
   ret <1 x i64> %c
-
-; CHECK-LABEL: @d(
-; CHECK-NEXT:  insertelement <1 x i64> undef, i64 %y, i32 0
-; CHECK-NEXT:  ret <1 x i64>
 }
 
-
diff --git a/test/Transforms/InstCombine/bitcast.ll b/test/Transforms/InstCombine/bitcast.ll
index 2e7f30fee14d..4cf3f27ab014 100644
--- a/test/Transforms/InstCombine/bitcast.ll
+++ b/test/Transforms/InstCombine/bitcast.ll
@@ -70,6 +70,51 @@ define <2 x i32> @or_bitcast_int_to_vec(i64 %a) {
   ret <2 x i32> %t2
 }
 
+; PR26702 - https://bugs.llvm.org//show_bug.cgi?id=26702
+; Bitcast is canonicalized below logic, so we can see the not-not pattern.
+
+define <2 x i64> @is_negative(<4 x i32> %x) {
+; CHECK-LABEL: @is_negative(
+; CHECK-NEXT:    [[LOBIT:%.*]] = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    [[NOTNOT:%.*]] = bitcast <4 x i32> [[LOBIT]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[NOTNOT]]
+;
+  %lobit = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
+  %not = xor <4 x i32> %lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bc = bitcast <4 x i32> %not to <2 x i64>
+  %notnot = xor <2 x i64> %bc, <i64 -1, i64 -1>
+  ret <2 x i64> %notnot
+}
+
+; This variation has an extra bitcast at the end. This means that the 2nd xor
+; can be done in <4 x i32> to eliminate a bitcast regardless of canonicalizaion.
+
+define <4 x i32> @is_negative_bonus_bitcast(<4 x i32> %x) {
+; CHECK-LABEL: @is_negative_bonus_bitcast(
+; CHECK-NEXT:    [[LOBIT:%.*]] = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
+; CHECK-NEXT:    ret <4 x i32> [[LOBIT]]
+;
+  %lobit = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
+  %not = xor <4 x i32> %lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %bc = bitcast <4 x i32> %not to <2 x i64>
+  %notnot = xor <2 x i64> %bc, <i64 -1, i64 -1>
+  %bc2 = bitcast <2 x i64> %notnot to <4 x i32>
+  ret <4 x i32> %bc2
+}
+
+; Negative test: bitcasts are canonicalized below bitwise logic. No changes here.
+
+define <2 x i8> @canonicalize_bitcast_logic_with_constant(<4 x i4> %x) {
+; CHECK-LABEL: @canonicalize_bitcast_logic_with_constant(
+; CHECK-NEXT:    [[A:%.*]] = and <4 x i4> %x, <i4 0, i4 -8, i4 0, i4 -8>
+; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i4> [[A]] to <2 x i8>
+; CHECK-NEXT:    ret <2 x i8> [[B]]
+;
+  %a = and <4 x i4> %x, <i4 0, i4 8, i4 0, i4 8>
+  %b = bitcast <4 x i4> %a to <2 x i8>
+  ret <2 x i8> %b
+}
+
 ; PR27925 - https://llvm.org/bugs/show_bug.cgi?id=27925
 
 define <4 x i32> @bitcasts_and_bitcast(<4 x i32> %a, <8 x i16> %b) {
diff --git a/test/Transforms/InstCombine/ctpop.ll b/test/Transforms/InstCombine/ctpop.ll
index 38612c92aaa4..6bc6f9731979 100644
--- a/test/Transforms/InstCombine/ctpop.ll
+++ b/test/Transforms/InstCombine/ctpop.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -S -instcombine | FileCheck %s
 
 declare i32 @llvm.ctpop.i32(i32)
@@ -5,8 +6,9 @@ declare i8 @llvm.ctpop.i8(i8)
 declare void @llvm.assume(i1)
 
 define i1 @test1(i32 %arg) {
-; CHECK: @test1
-; CHECK: ret i1 false
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    ret i1 false
+;
   %and = and i32 %arg, 15
   %cnt = call i32 @llvm.ctpop.i32(i32 %and)
   %res = icmp eq i32 %cnt, 9
@@ -14,8 +16,9 @@ define i1 @test1(i32 %arg) {
 }
 
 define i1 @test2(i32 %arg) {
-; CHECK: @test2
-; CHECK: ret i1 false
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    ret i1 false
+;
   %and = and i32 %arg, 1
   %cnt = call i32 @llvm.ctpop.i32(i32 %and)
   %res = icmp eq i32 %cnt, 2
@@ -23,9 +26,12 @@ define i1 @test2(i32 %arg) {
 }
 
 define i1 @test3(i32 %arg) {
-; CHECK: @test3
-; CHECK: ret i1 false
-  ;; Use an assume to make all the bits known without triggering constant 
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[ASSUME:%.*]] = icmp eq i32 [[ARG:%.*]], 0
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ASSUME]])
+; CHECK-NEXT:    ret i1 false
+;
+  ;; Use an assume to make all the bits known without triggering constant
   ;; folding.  This is trying to hit a corner case where we have to avoid
   ;; taking the log of 0.
   %assume = icmp eq i32 %arg, 0
@@ -37,8 +43,11 @@ define i1 @test3(i32 %arg) {
 
 ; Negative test for when we know nothing
 define i1 @test4(i8 %arg) {
-; CHECK: @test4
-; CHECK: ret i1 %res
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[CNT:%.*]] = call i8 @llvm.ctpop.i8(i8 [[ARG:%.*]])
+; CHECK-NEXT:    [[RES:%.*]] = icmp eq i8 [[CNT]], 2
+; CHECK-NEXT:    ret i1 [[RES]]
+;
   %cnt = call i8 @llvm.ctpop.i8(i8 %arg)
   %res = icmp eq i8 %cnt, 2
   ret i1 %res
diff --git a/test/Transforms/InstCombine/icmp-xor-signbit.ll b/test/Transforms/InstCombine/icmp-xor-signbit.ll
new file mode 100644
index 000000000000..30a9668f37df
--- /dev/null
+++ b/test/Transforms/InstCombine/icmp-xor-signbit.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
+
+define i1 @slt_to_ult(i8 %x, i8 %y) {
+; CHECK-LABEL: @slt_to_ult(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 %x, %y
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %a = xor i8 %x, 128
+  %b = xor i8 %y, 128
+  %cmp = icmp slt i8 %a, %b
+  ret i1 %cmp
+}
+
+; PR33138 - https://bugs.llvm.org/show_bug.cgi?id=33138
+
+define <2 x i1> @slt_to_ult_splat(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @slt_to_ult_splat(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult <2 x i8> %x, %y
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %a = xor <2 x i8> %x, <i8 128, i8 128>
+  %b = xor <2 x i8> %y, <i8 128, i8 128>
+  %cmp = icmp slt <2 x i8> %a, %b
+  ret <2 x i1> %cmp
+}
+
+; Make sure that unsigned -> signed works too.
+
+define i1 @ult_to_slt(i8 %x, i8 %y) {
+; CHECK-LABEL: @ult_to_slt(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 %x, %y
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %a = xor i8 %x, 128
+  %b = xor i8 %y, 128
+  %cmp = icmp ult i8 %a, %b
+  ret i1 %cmp
+}
+
+define <2 x i1> @ult_to_slt_splat(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @ult_to_slt_splat(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i8> %x, %y
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %a = xor <2 x i8> %x, <i8 128, i8 128>
+  %b = xor <2 x i8> %y, <i8 128, i8 128>
+  %cmp = icmp ult <2 x i8> %a, %b
+  ret <2 x i1> %cmp
+}
+
+; icmp u/s (a ^ maxsignval), (b ^ maxsignval) --> icmp s/u' a, b
+
+define i1 @slt_to_ugt(i8 %x, i8 %y) {
+; CHECK-LABEL: @slt_to_ugt(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 %x, %y
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %a = xor i8 %x, 127
+  %b = xor i8 %y, 127
+  %cmp = icmp slt i8 %a, %b
+  ret i1 %cmp
+}
+
+define <2 x i1> @slt_to_ugt_splat(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @slt_to_ugt_splat(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt <2 x i8> %x, %y
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %a = xor <2 x i8> %x, <i8 127, i8 127>
+  %b = xor <2 x i8> %y, <i8 127, i8 127>
+  %cmp = icmp slt <2 x i8> %a, %b
+  ret <2 x i1> %cmp
+}
+
+; Make sure that unsigned -> signed works too.
+
+define i1 @ult_to_sgt(i8 %x, i8 %y) {
+; CHECK-LABEL: @ult_to_sgt(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 %x, %y
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %a = xor i8 %x, 127
+  %b = xor i8 %y, 127
+  %cmp = icmp ult i8 %a, %b
+  ret i1 %cmp
+}
+
+define <2 x i1> @ult_to_sgt_splat(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @ult_to_sgt_splat(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <2 x i8> %x, %y
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %a = xor <2 x i8> %x, <i8 127, i8 127>
+  %b = xor <2 x i8> %y, <i8 127, i8 127>
+  %cmp = icmp ult <2 x i8> %a, %b
+  ret <2 x i1> %cmp
+}
+
+; icmp u/s (a ^ signmask), C --> icmp s/u a, C'
+
+define i1 @sge_to_ugt(i8 %x) {
+; CHECK-LABEL: @sge_to_ugt(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 %x, -114
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %a = xor i8 %x, 128
+  %cmp = icmp sge i8 %a, 15
+  ret i1 %cmp
+}
+
+define <2 x i1> @sge_to_ugt_splat(<2 x i8> %x) {
+; CHECK-LABEL: @sge_to_ugt_splat(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt <2 x i8> %x, <i8 -114, i8 -114>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %a = xor <2 x i8> %x, <i8 128, i8 128>
+  %cmp = icmp sge <2 x i8> %a, <i8 15, i8 15>
+  ret <2 x i1> %cmp
+}
+
+; Make sure that unsigned -> signed works too.
+
+define i1 @uge_to_sgt(i8 %x) {
+; CHECK-LABEL: @uge_to_sgt(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i8 %x, -114
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %a = xor i8 %x, 128
+  %cmp = icmp uge i8 %a, 15
+  ret i1 %cmp
+}
+
+define <2 x i1> @uge_to_sgt_splat(<2 x i8> %x) {
+; CHECK-LABEL: @uge_to_sgt_splat(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt <2 x i8> %x, <i8 -114, i8 -114>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %a = xor <2 x i8> %x, <i8 128, i8 128>
+  %cmp = icmp uge <2 x i8> %a, <i8 15, i8 15>
+  ret <2 x i1> %cmp
+}
+
+; icmp u/s (a ^ maxsignval), C --> icmp s/u' a, C'
+
+define i1 @sge_to_ult(i8 %x) {
+; CHECK-LABEL: @sge_to_ult(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 %x, 113
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %a = xor i8 %x, 127
+  %cmp = icmp sge i8 %a, 15
+  ret i1 %cmp
+}
+
+define <2 x i1> @sge_to_ult_splat(<2 x i8> %x) {
+; CHECK-LABEL: @sge_to_ult_splat(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult <2 x i8> %x, <i8 113, i8 113>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %a = xor <2 x i8> %x, <i8 127, i8 127>
+  %cmp = icmp sge <2 x i8> %a, <i8 15, i8 15>
+  ret <2 x i1> %cmp
+}
+
+; Make sure that unsigned -> signed works too.
+
+define i1 @uge_to_slt(i8 %x) {
+; CHECK-LABEL: @uge_to_slt(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i8 %x, 113
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %a = xor i8 %x, 127
+  %cmp = icmp uge i8 %a, 15
+  ret i1 %cmp
+}
+
+define <2 x i1> @uge_to_slt_splat(<2 x i8> %x) {
+; CHECK-LABEL: @uge_to_slt_splat(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i8> %x, <i8 113, i8 113>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %a = xor <2 x i8> %x, <i8 127, i8 127>
+  %cmp = icmp uge <2 x i8> %a, <i8 15, i8 15>
+  ret <2 x i1> %cmp
+}
+
+; PR33138, part 2: https://bugs.llvm.org/show_bug.cgi?id=33138
+; TODO: We could look through vector bitcasts for icmp folds,
+; or we could canonicalize bitcast ahead of logic ops with constants.
+
+define <8 x i1> @sgt_to_ugt_bitcasted_splat(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @sgt_to_ugt_bitcasted_splat(
+; CHECK-NEXT:    [[A:%.*]] = xor <2 x i32> %x, <i32 -2139062144, i32 -2139062144>
+; CHECK-NEXT:    [[B:%.*]] = xor <2 x i32> %y, <i32 -2139062144, i32 -2139062144>
+; CHECK-NEXT:    [[C:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+; CHECK-NEXT:    [[D:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+; CHECK-NEXT:    [[E:%.*]] = icmp sgt <8 x i8> [[C]], [[D]]
+; CHECK-NEXT:    ret <8 x i1> [[E]]
+;
+  %a = xor <2 x i32> %x, <i32 2155905152, i32 2155905152> ; 0x80808080
+  %b = xor <2 x i32> %y, <i32 2155905152, i32 2155905152>
+  %c = bitcast <2 x i32> %a to <8 x i8>
+  %d = bitcast <2 x i32> %b to <8 x i8>
+  %e = icmp sgt <8 x i8> %c, %d
+  ret <8 x i1> %e
+}
+
+; TODO: This is false (little-endian). How should that be recognized?
+; Ie, should InstSimplify know this directly, should InstCombine canonicalize
+; this so InstSimplify can know this, or is that not something that we want
+; either pass to recognize?
+
+define <2 x i1> @negative_simplify_splat(<4 x i8> %x) {
+; CHECK-LABEL: @negative_simplify_splat(
+; CHECK-NEXT:    [[A:%.*]] = or <4 x i8> %x, <i8 0, i8 -128, i8 0, i8 -128>
+; CHECK-NEXT:    [[B:%.*]] = bitcast <4 x i8> [[A]] to <2 x i16>
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt <2 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[C]]
+;
+  %a = or <4 x i8> %x, <i8 0, i8 128, i8 0, i8 128>
+  %b = bitcast <4 x i8> %a to <2 x i16>
+  %c = icmp sgt <2 x i16> %b, zeroinitializer
+  ret <2 x i1> %c
+}
+
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index 6f657b190454..ed570da73c9e 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -2895,3 +2895,67 @@ define i1 @cmp_ult_rhs_dec(float %x, i32 %y) {
   %cmp = icmp ult i32 %conv, %dec
   ret i1 %cmp
 }
+
+define i1 @eq_add_constants(i32 %x, i32 %y) {
+; CHECK-LABEL: @eq_add_constants(
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 %x, %y
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %A = add i32 %x, 5
+  %B = add i32 %y, 5
+  %C = icmp eq i32 %A, %B
+  ret i1 %C
+}
+
+define i1 @eq_mul_constants(i32 %x, i32 %y) {
+; CHECK-LABEL: @eq_mul_constants(
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 %x, %y
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %A = mul i32 %x, 5
+  %B = mul i32 %y, 5
+  %C = icmp eq i32 %A, %B
+  ret i1 %C
+}
+
+define <2 x i1> @eq_mul_constants_splat(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @eq_mul_constants_splat(
+; CHECK-NEXT:    [[C:%.*]] = icmp ne <2 x i32> %x, %y
+; CHECK-NEXT:    ret <2 x i1> [[C]]
+;
+  %A = mul <2 x i32> %x, <i32 5, i32 5>
+  %B = mul <2 x i32> %y, <i32 5, i32 5>
+  %C = icmp ne <2 x i32> %A, %B
+  ret <2 x i1> %C
+}
+
+; If the multiply constant has any trailing zero bits, we get something completely different.
+; We mask off the high bits of each input and then convert:
+; (X&Z) == (Y&Z) -> (X^Y) & Z == 0
+
+define i1 @eq_mul_constants_with_tz(i32 %x, i32 %y) {
+; CHECK-LABEL: @eq_mul_constants_with_tz(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 %x, %y
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 1073741823
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %A = mul i32 %x, 12
+  %B = mul i32 %y, 12
+  %C = icmp ne i32 %A, %B
+  ret i1 %C
+}
+
+define <2 x i1> @eq_mul_constants_with_tz_splat(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @eq_mul_constants_with_tz_splat(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i32> %x, %y
+; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i32> [[TMP1]], <i32 1073741823, i32 1073741823>
+; CHECK-NEXT:    [[C:%.*]] = icmp eq <2 x i32> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    ret <2 x i1> [[C]]
+;
+  %A = mul <2 x i32> %x, <i32 12, i32 12>
+  %B = mul <2 x i32> %y, <i32 12, i32 12>
+  %C = icmp eq <2 x i32> %A, %B
+  ret <2 x i1> %C
+}
+
diff --git a/test/Transforms/InstSimplify/call.ll b/test/Transforms/InstSimplify/call.ll
index 988ec2b71c50..68daac65ee6b 100644
--- a/test/Transforms/InstSimplify/call.ll
+++ b/test/Transforms/InstSimplify/call.ll
@@ -1,64 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instsimplify -S | FileCheck %s
 ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s
 
 declare {i8, i1} @llvm.uadd.with.overflow.i8(i8 %a, i8 %b)
+declare {i8, i1} @llvm.sadd.with.overflow.i8(i8 %a, i8 %b)
 declare {i8, i1} @llvm.usub.with.overflow.i8(i8 %a, i8 %b)
 declare {i8, i1} @llvm.ssub.with.overflow.i8(i8 %a, i8 %b)
 declare {i8, i1} @llvm.umul.with.overflow.i8(i8 %a, i8 %b)
+declare {i8, i1} @llvm.smul.with.overflow.i8(i8 %a, i8 %b)
 
 define i1 @test_uadd1() {
 ; CHECK-LABEL: @test_uadd1(
+; CHECK-NEXT:    ret i1 true
+;
   %x = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 254, i8 3)
   %overflow = extractvalue {i8, i1} %x, 1
   ret i1 %overflow
-; CHECK-NEXT: ret i1 true
 }
 
 define i8 @test_uadd2() {
 ; CHECK-LABEL: @test_uadd2(
+; CHECK-NEXT:    ret i8 42
+;
   %x = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 254, i8 44)
   %result = extractvalue {i8, i1} %x, 0
   ret i8 %result
-; CHECK-NEXT: ret i8 42
+}
+
+define {i8, i1} @test_uadd3(i8 %v) {
+; CHECK-LABEL: @test_uadd3(
+; CHECK-NEXT:    ret { i8, i1 } undef
+;
+  %result = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %v, i8 undef)
+  ret {i8, i1} %result
+}
+
+define {i8, i1} @test_uadd4(i8 %v) {
+; CHECK-LABEL: @test_uadd4(
+; CHECK-NEXT:    ret { i8, i1 } undef
+;
+  %result = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 undef, i8 %v)
+  ret {i8, i1} %result
+}
+
+define i1 @test_sadd1() {
+; CHECK-LABEL: @test_sadd1(
+; CHECK-NEXT:    ret i1 true
+;
+  %x = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 126, i8 3)
+  %overflow = extractvalue {i8, i1} %x, 1
+  ret i1 %overflow
+}
+
+define i8 @test_sadd2() {
+; CHECK-LABEL: @test_sadd2(
+; CHECK-NEXT:    ret i8 -86
+;
+  %x = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 126, i8 44)
+  %result = extractvalue {i8, i1} %x, 0
+  ret i8 %result
+}
+
+define {i8, i1} @test_sadd3(i8 %v) {
+; CHECK-LABEL: @test_sadd3(
+; CHECK-NEXT:    ret { i8, i1 } undef
+;
+  %result = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v, i8 undef)
+  ret {i8, i1} %result
+}
+
+define {i8, i1} @test_sadd4(i8 %v) {
+; CHECK-LABEL: @test_sadd4(
+; CHECK-NEXT:    ret { i8, i1 } undef
+;
+  %result = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 undef, i8 %v)
+  ret {i8, i1} %result
 }
 
 define {i8, i1} @test_usub1(i8 %V) {
 ; CHECK-LABEL: @test_usub1(
+; CHECK-NEXT:    ret { i8, i1 } zeroinitializer
+;
   %x = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %V, i8 %V)
   ret {i8, i1} %x
-; CHECK-NEXT: ret { i8, i1 } zeroinitializer
+}
+
+define {i8, i1} @test_usub2(i8 %V) {
+; CHECK-LABEL: @test_usub2(
+; CHECK-NEXT:    ret { i8, i1 } undef
+;
+  %x = call {i8, i1} @llvm.usub.with.overflow.i8(i8 %V, i8 undef)
+  ret {i8, i1} %x
+}
+
+define {i8, i1} @test_usub3(i8 %V) {
+; CHECK-LABEL: @test_usub3(
+; CHECK-NEXT:    ret { i8, i1 } undef
+;
+  %x = call {i8, i1} @llvm.usub.with.overflow.i8(i8 undef, i8 %V)
+  ret {i8, i1} %x
 }
 
 define {i8, i1} @test_ssub1(i8 %V) {
 ; CHECK-LABEL: @test_ssub1(
+; CHECK-NEXT:    ret { i8, i1 } zeroinitializer
+;
   %x = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %V, i8 %V)
   ret {i8, i1} %x
-; CHECK-NEXT: ret { i8, i1 } zeroinitializer
+}
+
+define {i8, i1} @test_ssub2(i8 %V) {
+; CHECK-LABEL: @test_ssub2(
+; CHECK-NEXT:    ret { i8, i1 } undef
+;
+  %x = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %V, i8 undef)
+  ret {i8, i1} %x
+}
+
+define {i8, i1} @test_ssub3(i8 %V) {
+; CHECK-LABEL: @test_ssub3(
+; CHECK-NEXT:    ret { i8, i1 } undef
+;
+  %x = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 undef, i8 %V)
+  ret {i8, i1} %x
 }
 
 define {i8, i1} @test_umul1(i8 %V) {
 ; CHECK-LABEL: @test_umul1(
+; CHECK-NEXT:    ret { i8, i1 } zeroinitializer
+;
   %x = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %V, i8 0)
   ret {i8, i1} %x
-; CHECK-NEXT: ret { i8, i1 } zeroinitializer
+}
+
+define {i8, i1} @test_umul2(i8 %V) {
+; CHECK-LABEL: @test_umul2(
+; CHECK-NEXT:    ret { i8, i1 } zeroinitializer
+;
+  %x = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %V, i8 undef)
+  ret {i8, i1} %x
+}
+
+define {i8, i1} @test_umul3(i8 %V) {
+; CHECK-LABEL: @test_umul3(
+; CHECK-NEXT:    ret { i8, i1 } zeroinitializer
+;
+  %x = call {i8, i1} @llvm.umul.with.overflow.i8(i8 0, i8 %V)
+  ret {i8, i1} %x
+}
+
+define {i8, i1} @test_umul4(i8 %V) {
+; CHECK-LABEL: @test_umul4(
+; CHECK-NEXT:    ret { i8, i1 } zeroinitializer
+;
+  %x = call {i8, i1} @llvm.umul.with.overflow.i8(i8 undef, i8 %V)
+  ret {i8, i1} %x
+}
+
+define {i8, i1} @test_smul1(i8 %V) {
+; CHECK-LABEL: @test_smul1(
+; CHECK-NEXT:    ret { i8, i1 } zeroinitializer
+;
+  %x = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %V, i8 0)
+  ret {i8, i1} %x
+}
+
+define {i8, i1} @test_smul2(i8 %V) {
+; CHECK-LABEL: @test_smul2(
+; CHECK-NEXT:    ret { i8, i1 } zeroinitializer
+;
+  %x = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %V, i8 undef)
+  ret {i8, i1} %x
+}
+
+define {i8, i1} @test_smul3(i8 %V) {
+; CHECK-LABEL: @test_smul3(
+; CHECK-NEXT:    ret { i8, i1 } zeroinitializer
+;
+  %x = call {i8, i1} @llvm.smul.with.overflow.i8(i8 0, i8 %V)
+  ret {i8, i1} %x
+}
+
+define {i8, i1} @test_smul4(i8 %V) {
+; CHECK-LABEL: @test_smul4(
+; CHECK-NEXT:    ret { i8, i1 } zeroinitializer
+;
+  %x = call {i8, i1} @llvm.smul.with.overflow.i8(i8 undef, i8 %V)
+  ret {i8, i1} %x
 }
 
 declare i256 @llvm.cttz.i256(i256 %src, i1 %is_zero_undef)
 
 define i256 @test_cttz() {
 ; CHECK-LABEL: @test_cttz(
+; CHECK-NEXT:    ret i256 1
+;
   %x = call i256 @llvm.cttz.i256(i256 10, i1 false)
   ret i256 %x
-; CHECK-NEXT: ret i256 1
 }
 
 declare i256 @llvm.ctpop.i256(i256 %src)
 
 define i256 @test_ctpop() {
 ; CHECK-LABEL: @test_ctpop(
+; CHECK-NEXT:    ret i256 2
+;
   %x = call i256 @llvm.ctpop.i256(i256 10)
   ret i256 %x
-; CHECK-NEXT: ret i256 2
 }
 
 ; Test a non-intrinsic that we know about as a library call.
@@ -66,14 +214,15 @@ declare float @fabs(float %x)
 
 define float @test_fabs_libcall() {
 ; CHECK-LABEL: @test_fabs_libcall(
+; CHECK-NEXT:    [[X:%.*]] = call float @fabs(float -4.200000e+01)
+; CHECK-NEXT:    ret float 4.200000e+01
+;
 
   %x = call float @fabs(float -42.0)
 ; This is still a real function call, so instsimplify won't nuke it -- other
 ; passes have to do that.
-; CHECK-NEXT: call float @fabs
 
   ret float %x
-; CHECK-NEXT: ret float 4.2{{0+}}e+01
 }
 
 
@@ -87,34 +236,35 @@ declare float @llvm.nearbyint.f32(float) nounwind readnone
 ; Test idempotent intrinsics
 define float @test_idempotence(float %a) {
 ; CHECK-LABEL: @test_idempotence(
+; CHECK-NEXT:    [[A0:%.*]] = call float @llvm.fabs.f32(float [[A:%.*]])
+; CHECK-NEXT:    [[B0:%.*]] = call float @llvm.floor.f32(float [[A]])
+; CHECK-NEXT:    [[C0:%.*]] = call float @llvm.ceil.f32(float [[A]])
+; CHECK-NEXT:    [[D0:%.*]] = call float @llvm.trunc.f32(float [[A]])
+; CHECK-NEXT:    [[E0:%.*]] = call float @llvm.rint.f32(float [[A]])
+; CHECK-NEXT:    [[F0:%.*]] = call float @llvm.nearbyint.f32(float [[A]])
+; CHECK-NEXT:    [[R0:%.*]] = fadd float [[A0]], [[B0]]
+; CHECK-NEXT:    [[R1:%.*]] = fadd float [[R0]], [[C0]]
+; CHECK-NEXT:    [[R2:%.*]] = fadd float [[R1]], [[D0]]
+; CHECK-NEXT:    [[R3:%.*]] = fadd float [[R2]], [[E0]]
+; CHECK-NEXT:    [[R4:%.*]] = fadd float [[R3]], [[F0]]
+; CHECK-NEXT:    ret float [[R4]]
+;
 
-; CHECK: fabs
-; CHECK-NOT: fabs
   %a0 = call float @llvm.fabs.f32(float %a)
   %a1 = call float @llvm.fabs.f32(float %a0)
 
-; CHECK: floor
-; CHECK-NOT: floor
   %b0 = call float @llvm.floor.f32(float %a)
   %b1 = call float @llvm.floor.f32(float %b0)
 
-; CHECK: ceil
-; CHECK-NOT: ceil
   %c0 = call float @llvm.ceil.f32(float %a)
   %c1 = call float @llvm.ceil.f32(float %c0)
 
-; CHECK: trunc
-; CHECK-NOT: trunc
   %d0 = call float @llvm.trunc.f32(float %a)
   %d1 = call float @llvm.trunc.f32(float %d0)
 
-; CHECK: rint
-; CHECK-NOT: rint
   %e0 = call float @llvm.rint.f32(float %a)
   %e1 = call float @llvm.rint.f32(float %e0)
 
-; CHECK: nearbyint
-; CHECK-NOT: nearbyint
   %f0 = call float @llvm.nearbyint.f32(float %a)
   %f1 = call float @llvm.nearbyint.f32(float %f0)
 
@@ -128,6 +278,17 @@ define float @test_idempotence(float %a) {
 }
 
 define i8* @operator_new() {
+; CHECK-LABEL: @operator_new(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call noalias i8* @_Znwm(i64 8)
+; CHECK-NEXT:    br i1 false, label [[CAST_END:%.*]], label [[CAST_NOTNULL:%.*]]
+; CHECK:       cast.notnull:
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 4
+; CHECK-NEXT:    br label [[CAST_END]]
+; CHECK:       cast.end:
+; CHECK-NEXT:    [[CAST_RESULT:%.*]] = phi i8* [ [[ADD_PTR]], [[CAST_NOTNULL]] ], [ null, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i8* [[CAST_RESULT]]
+;
 entry:
   %call = tail call noalias i8* @_Znwm(i64 8)
   %cmp = icmp eq i8* %call, null
@@ -141,8 +302,6 @@ cast.end:                                         ; preds = %cast.notnull, %entr
   %cast.result = phi i8* [ %add.ptr, %cast.notnull ], [ null, %entry ]
   ret i8* %cast.result
 
-; CHECK-LABEL: @operator_new
-; CHECK: br i1 false, label %cast.end, label %cast.notnull
 }
 
 declare nonnull noalias i8* @_Znwm(i64)
@@ -151,6 +310,18 @@ declare nonnull noalias i8* @_Znwm(i64)
 @_ZSt7nothrow = external global %"struct.std::nothrow_t"
 
 define i8* @operator_new_nothrow_t() {
+; CHECK-LABEL: @operator_new_nothrow_t(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call noalias i8* @_ZnamRKSt9nothrow_t(i64 8, %"struct.std::nothrow_t"* @_ZSt7nothrow)
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8* [[CALL]], null
+; CHECK-NEXT:    br i1 [[CMP]], label [[CAST_END:%.*]], label [[CAST_NOTNULL:%.*]]
+; CHECK:       cast.notnull:
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 4
+; CHECK-NEXT:    br label [[CAST_END]]
+; CHECK:       cast.end:
+; CHECK-NEXT:    [[CAST_RESULT:%.*]] = phi i8* [ [[ADD_PTR]], [[CAST_NOTNULL]] ], [ null, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i8* [[CAST_RESULT]]
+;
 entry:
   %call = tail call noalias i8* @_ZnamRKSt9nothrow_t(i64 8, %"struct.std::nothrow_t"* @_ZSt7nothrow)
   %cmp = icmp eq i8* %call, null
@@ -164,13 +335,23 @@ cast.end:                                         ; preds = %cast.notnull, %entr
   %cast.result = phi i8* [ %add.ptr, %cast.notnull ], [ null, %entry ]
   ret i8* %cast.result
 
-; CHECK-LABEL: @operator_new_nothrow_t
-; CHECK: br i1 %cmp, label %cast.end, label %cast.notnull
 }
 
 declare i8* @_ZnamRKSt9nothrow_t(i64, %"struct.std::nothrow_t"*) nounwind
 
 define i8* @malloc_can_return_null() {
+; CHECK-LABEL: @malloc_can_return_null(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call noalias i8* @malloc(i64 8)
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8* [[CALL]], null
+; CHECK-NEXT:    br i1 [[CMP]], label [[CAST_END:%.*]], label [[CAST_NOTNULL:%.*]]
+; CHECK:       cast.notnull:
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 4
+; CHECK-NEXT:    br label [[CAST_END]]
+; CHECK:       cast.end:
+; CHECK-NEXT:    [[CAST_RESULT:%.*]] = phi i8* [ [[ADD_PTR]], [[CAST_NOTNULL]] ], [ null, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i8* [[CAST_RESULT]]
+;
 entry:
   %call = tail call noalias i8* @malloc(i64 8)
   %cmp = icmp eq i8* %call, null
@@ -184,38 +365,44 @@ cast.end:                                         ; preds = %cast.notnull, %entr
   %cast.result = phi i8* [ %add.ptr, %cast.notnull ], [ null, %entry ]
   ret i8* %cast.result
 
-; CHECK-LABEL: @malloc_can_return_null
-; CHECK: br i1 %cmp, label %cast.end, label %cast.notnull
 }
 
 define i32 @call_null() {
+; CHECK-LABEL: @call_null(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 null()
+; CHECK-NEXT:    ret i32 undef
+;
 entry:
   %call = call i32 null()
   ret i32 %call
 }
-; CHECK-LABEL: define i32 @call_null(
-; CHECK: ret i32 undef
 
 define i32 @call_undef() {
+; CHECK-LABEL: @call_undef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 undef()
+; CHECK-NEXT:    ret i32 undef
+;
 entry:
   %call = call i32 undef()
   ret i32 %call
 }
-; CHECK-LABEL: define i32 @call_undef(
-; CHECK: ret i32 undef
 
 @GV = private constant [8 x i32] [i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49]
 
 define <8 x i32> @partial_masked_load() {
 ; CHECK-LABEL: @partial_masked_load(
-; CHECK:         ret <8 x i32> <i32 undef, i32 undef, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+; CHECK-NEXT:    ret <8 x i32> <i32 undef, i32 undef, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+;
   %masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast (i32* getelementptr ([8 x i32], [8 x i32]* @GV, i64 0, i64 -2) to <8 x i32>*), i32 4, <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
   ret <8 x i32> %masked.load
 }
 
 define <8 x i32> @masked_load_undef_mask(<8 x i32>* %V) {
 ; CHECK-LABEL: @masked_load_undef_mask(
-; CHECK:         ret <8 x i32> <i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i32> <i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0>
+;
   %masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %V, i32 4, <8 x i1> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0>)
   ret <8 x i32> %masked.load
 }
diff --git a/test/Transforms/InstSimplify/or.ll b/test/Transforms/InstSimplify/or.ll
index 2c5b6181bc6c..14b08af00646 100644
--- a/test/Transforms/InstSimplify/or.ll
+++ b/test/Transforms/InstSimplify/or.ll
@@ -159,7 +159,7 @@ define i399 @test4_apint(i399 %V, i399 %M) {
     %A = add i399 %V, %N
     %B = and i399 %A, %C1
     %D = and i399 %V, 274877906943
-    %R = or i399 %B, %D
+    %R = or i399 %D, %B
     ret i399 %R
 }
 
@@ -179,3 +179,42 @@ define i117 @test6_apint(i117 %X) {
     ret i117 %Y
 }
 
+; Test the case where integer BitWidth <= 64 && BitWidth % 2 != 0.
+; Vector version of test1_apint with the add commuted
+define <2 x i39> @test7_apint(<2 x i39> %V, <2 x i39> %M) {
+; CHECK-LABEL: @test7_apint(
+; CHECK-NEXT:    [[N:%.*]] = and <2 x i39> [[M:%.*]], <i39 -274877906944, i39 -274877906944>
+; CHECK-NEXT:    [[A:%.*]] = add <2 x i39> [[N]], [[V:%.*]]
+; CHECK-NEXT:    ret <2 x i39> [[A]]
+;
+  ;; If we have: ((V + N) & C1) | (V & C2)
+  ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+  ;; replace with V+N.
+  %C1 = xor <2 x i39> <i39 274877906943, i39 274877906943>, <i39 -1, i39 -1> ;; C2 = 274877906943
+  %N = and <2 x i39> %M, <i39 274877906944, i39 274877906944>
+  %A = add <2 x i39> %N, %V
+  %B = and <2 x i39> %A, %C1
+  %D = and <2 x i39> %V, <i39 274877906943, i39 274877906943>
+  %R = or <2 x i39> %B, %D
+  ret <2 x i39> %R
+}
+
+; Test the case where Integer BitWidth > 64 && BitWidth <= 1024.
+; Vector version of test4_apint with the add and the or commuted
+define <2 x i399> @test8_apint(<2 x i399> %V, <2 x i399> %M) {
+; CHECK-LABEL: @test8_apint(
+; CHECK-NEXT:    [[N:%.*]] = and <2 x i399> [[M:%.*]], <i399 18446742974197923840, i399 18446742974197923840>
+; CHECK-NEXT:    [[A:%.*]] = add <2 x i399> [[N]], [[V:%.*]]
+; CHECK-NEXT:    ret <2 x i399> [[A]]
+;
+  ;; If we have: ((V + N) & C1) | (V & C2)
+  ;; .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+  ;; replace with V+N.
+  %C1 = xor <2 x i399> <i399 274877906943, i399 274877906943>, <i399 -1, i399 -1> ;; C2 = 274877906943
+  %N = and <2 x i399> %M, <i399 18446742974197923840, i399 18446742974197923840>
+  %A = add <2 x i399> %N, %V
+  %B = and <2 x i399> %A, %C1
+  %D = and <2 x i399> %V, <i399 274877906943, i399 274877906943>
+  %R = or <2 x i399> %D, %B
+  ret <2 x i399> %R
+}
diff --git a/test/Transforms/JumpThreading/assume.ll b/test/Transforms/JumpThreading/assume.ll
index 3a039676e172..f58ee299cba0 100644
--- a/test/Transforms/JumpThreading/assume.ll
+++ b/test/Transforms/JumpThreading/assume.ll
@@ -59,12 +59,12 @@ return:                                           ; preds = %entry, %if.then
 @g = external global i32
 
 ; Check that we do prove a fact using an assume within the block.
-; FIXME: We can fold the assume based on the semantics of assume.
-; CHECK-LABEL: @can_fold_assume
-; CHECK: %notnull = icmp ne i32* %array, null
-; CHECK-NEXT: call void @llvm.assume(i1 %notnull)
-; CHECK-NEXT: ret void
+; We can fold the assume based on the semantics of assume.
 define void @can_fold_assume(i32* %array) {
+; CHECK-LABEL: @can_fold_assume
+; CHECK-NOT: call void @llvm.assume
+; CHECK-NOT: br
+; CHECK: ret void
   %notnull = icmp ne i32* %array, null
   call void @llvm.assume(i1 %notnull)
   br i1 %notnull, label %normal, label %error
@@ -80,19 +80,128 @@ error:
 declare void @f(i1)
 declare void @exit()
 ; We can fold the assume but not the uses before the assume.
-define void @dont_fold_incorrectly(i32* %array) {
-; CHECK-LABEL:@dont_fold_incorrectly
+define void @cannot_fold_use_before_assume(i32* %array) {
+; CHECK-LABEL:@cannot_fold_use_before_assume
 ; CHECK: @f(i1 %notnull)
 ; CHECK-NEXT: exit()
-; CHECK-NEXT: assume(i1 %notnull)
+; CHECK-NOT: assume
+; CHECK-NEXT: ret void
+  %notnull = icmp ne i32* %array, null
+  call void @f(i1 %notnull)
+  call void @exit()
+  call void @llvm.assume(i1 %notnull)
+  br i1 %notnull, label %normal, label %error
+
+normal:
+  ret void
+
+error:
+  store atomic i32 0, i32* @g unordered, align 4
+  ret void
+}
+
+declare void @dummy(i1) nounwind argmemonly
+define void @can_fold_some_use_before_assume(i32* %array) {
+
+; CHECK-LABEL:@can_fold_some_use_before_assume
+; CHECK: @f(i1 %notnull)
+; CHECK-NEXT: @dummy(i1 true)
+; CHECK-NOT: assume
 ; CHECK-NEXT: ret void
   %notnull = icmp ne i32* %array, null
   call void @f(i1 %notnull)
+  call void @dummy(i1 %notnull)
+  call void @llvm.assume(i1 %notnull)
+  br i1 %notnull, label %normal, label %error
+
+normal:
+  ret void
+
+error:
+  store atomic i32 0, i32* @g unordered, align 4
+  ret void
+
+}
+
+; FIXME: can fold assume and all uses before/after assume.
+; because the trapping exit call is after the assume.
+define void @can_fold_assume_and_all_uses(i32* %array) {
+; CHECK-LABEL:@can_fold_assume_and_all_uses
+; CHECK: @dummy(i1 %notnull)
+; CHECK-NEXT: assume(i1 %notnull)
+; CHECK-NEXT: exit()
+; CHECK-NEXT: %notnull2 = or i1 true, false
+; CHECK-NEXT: @f(i1 %notnull2)
+; CHECK-NEXT: ret void
+  %notnull = icmp ne i32* %array, null
+  call void @dummy(i1 %notnull)
+  call void @llvm.assume(i1 %notnull)
   call void @exit()
+  br i1 %notnull, label %normal, label %error
+
+normal:
+  %notnull2 = or i1 %notnull, false
+  call void @f(i1 %notnull2)
+  ret void
+
+error:
+  store atomic i32 0, i32* @g unordered, align 4
+  ret void
+}
+
+declare void @fz(i8)
+; FIXME: We can fold assume to true, and the use after assume, but we do not do so
+; currently, because of the function call after the assume.
+define void @can_fold_assume2(i32* %array) {
+
+; CHECK-LABEL:@can_fold_assume2
+; CHECK: @f(i1 %notnull)
+; CHECK-NEXT: assume(i1 %notnull)
+; CHECK-NEXT: znotnull = zext i1 %notnull to i8
+; CHECK-NEXT: @f(i1 %notnull)
+; CHECK-NEXT: @f(i1 true)
+; CHECK-NEXT: @fz(i8 %znotnull)
+; CHECK-NEXT: ret void
+  %notnull = icmp ne i32* %array, null
+  call void @f(i1 %notnull)
+  call void @llvm.assume(i1 %notnull)
+  %znotnull = zext i1 %notnull to i8
+  call void @f(i1 %notnull)
+  br i1 %notnull, label %normal, label %error
+
+normal:
+  call void @f(i1 %notnull)
+  call void @fz(i8 %znotnull)
+  ret void
+
+error:
+  store atomic i32 0, i32* @g unordered, align 4
+  ret void
+}
+
+declare void @llvm.experimental.guard(i1, ...)
+; FIXME: We can fold assume to true, but we do not do so
+; because of the guard following the assume.
+define void @can_fold_assume3(i32* %array){
+
+; CHECK-LABEL:@can_fold_assume3
+; CHECK: @f(i1 %notnull)
+; CHECK-NEXT: assume(i1 %notnull)
+; CHECK-NEXT: guard(i1 %notnull)
+; CHECK-NEXT: znotnull = zext i1 true to i8
+; CHECK-NEXT: @f(i1 true)
+; CHECK-NEXT: @fz(i8 %znotnull)
+; CHECK-NEXT: ret void
+  %notnull = icmp ne i32* %array, null
+  call void @f(i1 %notnull)
   call void @llvm.assume(i1 %notnull)
+  call void(i1, ...) @llvm.experimental.guard(i1 %notnull) [ "deopt"() ]
+  %znotnull = zext i1 %notnull to i8
   br i1 %notnull, label %normal, label %error
 
 normal:
+  call void @f(i1 %notnull)
+  call void @fz(i8 %znotnull)
   ret void
 
 error:
@@ -100,6 +209,26 @@ error:
   ret void
 }
 
+
+; can fold all uses and remove the cond
+define void @can_fold_assume4(i32* %array) {
+; CHECK-LABEL: can_fold_assume4
+; CHECK-NOT: notnull
+; CHECK: dummy(i1 true)
+; CHECK-NEXT: ret void
+  %notnull = icmp ne i32* %array, null
+  call void @exit()
+  call void @dummy(i1 %notnull)
+  call void @llvm.assume(i1 %notnull)
+  br i1 %notnull, label %normal, label %error
+
+normal:
+  ret void
+
+error:
+  store atomic i32 0, i32* @g unordered, align 4
+  ret void
+}
 ; Function Attrs: nounwind
 declare void @llvm.assume(i1) #1
 
diff --git a/test/Transforms/JumpThreading/fold-not-thread.ll b/test/Transforms/JumpThreading/fold-not-thread.ll
index f05169b31bc8..85cdcc0d9b33 100644
--- a/test/Transforms/JumpThreading/fold-not-thread.ll
+++ b/test/Transforms/JumpThreading/fold-not-thread.ll
@@ -133,10 +133,10 @@ L3:
   ret void
 }
 
-; FIXME: Make sure we can do the RAUW for %add...
+; Make sure we can do the RAUW for %add...
 ;
 ; CHECK-LABEL: @rauw_if_possible(
-; CHECK: call void @f4(i32 %add)
+; CHECK: call void @f4(i32 96)
 define void @rauw_if_possible(i32 %value) nounwind {
 entry:
   %cmp = icmp eq i32 %value, 32
diff --git a/test/Transforms/JumpThreading/guards.ll b/test/Transforms/JumpThreading/guards.ll
index c5f72b113efc..53175a7b7253 100644
--- a/test/Transforms/JumpThreading/guards.ll
+++ b/test/Transforms/JumpThreading/guards.ll
@@ -182,86 +182,89 @@ Exit:
   ret void
 }
 
-declare void @never_called()
+declare void @never_called(i1)
 
-; Assume the guard is always taken and we deoptimize, so we never reach the
-; branch below that guard. We should *never* change the behaviour of a guard from
-; `must deoptimize` to `may deoptimize`, since this affects the program
-; semantics.
+; LVI uses guard to identify value of %c2 in branch as true, we cannot replace that
+; guard with guard(true & c1).
 define void @dont_fold_guard(i8* %addr, i32 %i, i32 %length) {
 ; CHECK-LABEL: dont_fold_guard
-; CHECK: experimental.guard(i1 %wide.chk)
-
-entry:
-  br label %BBPred
+; CHECK: %wide.chk = and i1 %c1, %c2
+; CHECK-NEXT: experimental.guard(i1 %wide.chk)
+; CHECK-NEXT: call void @never_called(i1 true)
+; CHECK-NEXT: ret void
+  %c1 = icmp ult i32 %i, %length
+  %c2 = icmp eq i32 %i, 0
+  %wide.chk = and i1 %c1, %c2
+  call void(i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ]
+  br i1 %c2, label %BB1, label %BB2
 
-BBPred:
- %cond = icmp eq i8* %addr, null
- br i1 %cond, label %zero, label %not_zero
+BB1:
+  call void @never_called(i1 %c2)
+  ret void
 
-zero:
-  unreachable
+BB2:
+  ret void
+}
 
-not_zero:
+declare void @dummy(i1) nounwind argmemonly
+; same as dont_fold_guard1 but there's a use immediately after guard and before
+; branch. We can fold that use.
+define void @dont_fold_guard2(i8* %addr, i32 %i, i32 %length) {
+; CHECK-LABEL: dont_fold_guard2
+; CHECK: %wide.chk = and i1 %c1, %c2
+; CHECK-NEXT: experimental.guard(i1 %wide.chk)
+; CHECK-NEXT: dummy(i1 true)
+; CHECK-NEXT: call void @never_called(i1 true)
+; CHECK-NEXT: ret void
   %c1 = icmp ult i32 %i, %length
   %c2 = icmp eq i32 %i, 0
   %wide.chk = and i1 %c1, %c2
   call void(i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ]
-  br i1 %c2, label %unreachedBB2, label %unreachedBB1
+  call void @dummy(i1 %c2)
+  br i1 %c2, label %BB1, label %BB2
 
-unreachedBB2:
-  call void @never_called()
+BB1:
+  call void @never_called(i1 %c2)
   ret void
 
-unreachedBB1:
+BB2:
   ret void
 }
 
-
 ; same as dont_fold_guard1 but condition %cmp is not an instruction.
 ; We cannot fold the guard under any circumstance.
 ; FIXME: We can merge unreachableBB2 into not_zero.
-define void @dont_fold_guard2(i8* %addr, i1 %cmp, i32 %i, i32 %length) {
-; CHECK-LABEL: dont_fold_guard2
+define void @dont_fold_guard3(i8* %addr, i1 %cmp, i32 %i, i32 %length) {
+; CHECK-LABEL: dont_fold_guard3
 ; CHECK: guard(i1 %cmp)
-
-entry:
-  br label %BBPred
-
-BBPred:
- %cond = icmp eq i8* %addr, null
- br i1 %cond, label %zero, label %not_zero
-
-zero:
-  unreachable
-
-not_zero:
   call void(i1, ...) @llvm.experimental.guard(i1 %cmp) [ "deopt"() ]
-  br i1 %cmp, label %unreachedBB2, label %unreachedBB1
+  br i1 %cmp, label %BB1, label %BB2
 
-unreachedBB2:
-  call void @never_called()
+BB1:
+  call void @never_called(i1 %cmp)
   ret void
 
-unreachedBB1:
+BB2:
   ret void
 }
 
+declare void @f(i1)
 ; Same as dont_fold_guard1 but use switch instead of branch.
 ; triggers source code `ProcessThreadableEdges`.
-declare void @f(i1)
-define void @dont_fold_guard3(i1 %cmp1, i32 %i) nounwind {
-; CHECK-LABEL: dont_fold_guard3 
+define void @dont_fold_guard4(i1 %cmp1, i32 %i) nounwind {
+; CHECK-LABEL: dont_fold_guard4 
 ; CHECK-LABEL: L2:
 ; CHECK-NEXT: %cmp = icmp eq i32 %i, 0 
 ; CHECK-NEXT: guard(i1 %cmp)
-; CHECK-NEXT: @f(i1 %cmp)
+; CHECK-NEXT: dummy(i1 true)
+; CHECK-NEXT: @f(i1 true)
 ; CHECK-NEXT: ret void
 entry:
   br i1 %cmp1, label %L0, label %L3 
 L0:
   %cmp = icmp eq i32 %i, 0
   call void(i1, ...) @llvm.experimental.guard(i1 %cmp) [ "deopt"() ]
+  call void @dummy(i1 %cmp)
   switch i1 %cmp, label %L3 [
     i1 false, label %L1
     i1 true, label %L2
diff --git a/test/Transforms/LoopIdiom/pr33114.ll b/test/Transforms/LoopIdiom/pr33114.ll
new file mode 100644
index 000000000000..fa44d8e31e7c
--- /dev/null
+++ b/test/Transforms/LoopIdiom/pr33114.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Check that we're not crashing while looking at the recurrence variable.
+; RUN: opt -S -loop-idiom %s | FileCheck %s
+
+define void @tinkywinky() {
+; CHECK-LABEL: @tinkywinky(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[PH:%.*]]
+; CHECK:       ph:
+; CHECK-NEXT:    [[MYPHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[PATATINO:%.*]] = ashr i32 [[MYPHI]], undef
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[PATATINO]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[EXIT_LOOPEXIT:%.*]], label [[IF_END]]
+; CHECK:       exit.loopexit:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 true, label %exit, label %ph
+
+ph:
+  %myphi = phi i32 [ 1, %entry ]
+  br label %if.end
+
+if.end:
+  %patatino = ashr i32 %myphi, undef
+  %tobool = icmp eq i32 %patatino, 0
+  br i1 %tobool, label %exit, label %if.end
+
+exit:
+  ret void
+}
diff --git a/test/Transforms/LoopStrengthReduce/X86/incorrect-offset-scaling.ll b/test/Transforms/LoopStrengthReduce/X86/incorrect-offset-scaling.ll
index 3adb8bcf514d..00c3222b0051 100644
--- a/test/Transforms/LoopStrengthReduce/X86/incorrect-offset-scaling.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/incorrect-offset-scaling.ll
@@ -25,7 +25,7 @@ L2:                                               ; preds = %idxend.8
 if6:                                              ; preds = %idxend.8
   %r2 = add i64 %0, -1
   %r3 = load i64, i64* %1, align 8
-; CHECK-NOT:  %r2
+; CHECK:  %r2 = add i64 %0, -1
 ; CHECK:  %r3 = load i64
   br label %ib
 
@@ -36,13 +36,11 @@ ib:                                               ; preds = %if6
   %r4 = mul i64 %r3, %r0
   %r5 = add i64 %r2, %r4
   %r6 = icmp ult i64 %r5, undef
-; CHECK:  [[MUL1:%[0-9]+]] = mul i64 %lsr.iv, %r3
-; CHECK:  [[ADD1:%[0-9]+]] = add i64 [[MUL1]], -1
-; CHECK:  add i64 %{{.}}, [[ADD1]]
-; CHECK:  %r6
+; CHECK:  %r4 = mul i64 %r3, %lsr.iv
+; CHECK:  %r5 = add i64 %r2, %r4
+; CHECK:  %r6 = icmp ult i64 %r5, undef
+; CHECK:  %r7 = getelementptr i64, i64* undef, i64 %r5
   %r7 = getelementptr i64, i64* undef, i64 %r5
   store i64 1, i64* %r7, align 8
-; CHECK:  [[MUL2:%[0-9]+]] = mul i64 %lsr.iv, %r3
-; CHECK:  [[ADD2:%[0-9]+]] = add i64 [[MUL2]], -1
   br label %L
 }
diff --git a/test/Transforms/LoopStrengthReduce/lsr-expand-quadratic.ll b/test/Transforms/LoopStrengthReduce/X86/lsr-expand-quadratic.ll
index aa688d999e60..a7731bfcec56 100644
--- a/test/Transforms/LoopStrengthReduce/lsr-expand-quadratic.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/lsr-expand-quadratic.ll
@@ -1,5 +1,14 @@
+; REQUIRES: x86
 ; RUN: opt -loop-reduce -S < %s | FileCheck %s
 
+; Strength reduction analysis here relies on IV Users analysis, that
+; only finds users among instructions with types that are treated as
+; legal by the data layout. When running this test on pure non-x86
+; configs (for example, ARM 64), it gets confused with the target
+; triple and uses a default data layout instead. This default layout
+; does not have any legal types (even i32), so the transformation
+; does not happen.
+
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx"
 
@@ -7,16 +16,23 @@ target triple = "x86_64-apple-macosx"
 ;
 ; SCEV expander cannot expand quadratic recurrences outside of the
 ; loop. This recurrence depends on %sub.us, so can't be expanded.
+; We cannot fold SCEVUnknown (sub.us) with recurrences since it is
+; declared after the loop.
 ;
 ; CHECK-LABEL: @test2
 ; CHECK-LABEL: test2.loop:
-; CHECK: %lsr.iv = phi i32 [ %lsr.iv.next, %test2.loop ], [ -16777216, %entry ]
-; CHECK: %lsr.iv.next = add nsw i32 %lsr.iv, 16777216
+; CHECK:  %lsr.iv1 = phi i32 [ %lsr.iv.next2, %test2.loop ], [ -16777216, %entry ]
+; CHECK:  %lsr.iv = phi i32 [ %lsr.iv.next, %test2.loop ], [ -1, %entry ]
+; CHECK:  %lsr.iv.next = add nsw i32 %lsr.iv, 1
+; CHECK:  %lsr.iv.next2 = add nsw i32 %lsr.iv1, 16777216
 ;
 ; CHECK-LABEL: for.end:
-; CHECK: %sub.cond.us = sub nsw i32 %inc1115.us, %sub.us
-; CHECK: %sext.us = mul i32 %lsr.iv.next, %sub.cond.us
-; CHECK: %f = ashr i32 %sext.us, 24
+; CHECK:  %tobool.us = icmp eq i32 %lsr.iv.next2, 0
+; CHECK:  %sub.us = select i1 %tobool.us, i32 0, i32 0
+; CHECK:  %1 = sub i32 0, %sub.us
+; CHECK:  %2 = add i32 %1, %lsr.iv.next
+; CHECK:  %sext.us = mul i32 %lsr.iv.next2, %2
+; CHECK:  %f = ashr i32 %sext.us, 24
 ; CHECK: ret i32 %f
 define i32 @test2() {
 entry:
diff --git a/test/Transforms/LoopStrengthReduce/nonintegral.ll b/test/Transforms/LoopStrengthReduce/nonintegral.ll
new file mode 100644
index 000000000000..5648e3aa74af
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/nonintegral.ll
@@ -0,0 +1,45 @@
+; RUN: opt -S -loop-reduce < %s | FileCheck %s
+
+; Address Space 10 is non-integral. The optimizer is not allowed to use
+; ptrtoint/inttoptr instructions. Make sure that this doesn't happen
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @japi1__unsafe_getindex_65028(i64 addrspace(10)* %arg) {
+; CHECK-NOT: inttoptr
+; CHECK-NOT: ptrtoint
+; How exactly SCEV chooses to materialize isn't all that important, as
+; long as it doesn't try to round-trip through integers. As of this writing,
+; it emits a byte-wise gep, which is fine.
+; CHECK: getelementptr i64, i64 addrspace(10)* {{.*}}, i64 {{.*}}
+top:
+  br label %L86
+
+L86:                                              ; preds = %L86, %top
+  %i.0 = phi i64 [ 0, %top ], [ %tmp, %L86 ]
+  %tmp = add i64 %i.0, 1
+  br i1 undef, label %L86, label %if29
+
+if29:                                             ; preds = %L86
+  %tmp1 = shl i64 %tmp, 1
+  %tmp2 = add i64 %tmp1, -2
+  br label %if31
+
+if31:                                             ; preds = %if38, %if29
+  %"#temp#1.sroa.0.022" = phi i64 [ 0, %if29 ], [ %tmp3, %if38 ]
+  br label %L119
+
+L119:                                             ; preds = %L119, %if31
+  %i5.0 = phi i64 [ %"#temp#1.sroa.0.022", %if31 ], [ %tmp3, %L119 ]
+  %tmp3 = add i64 %i5.0, 1
+  br i1 undef, label %L119, label %if38
+
+if38:                                             ; preds = %L119
+  %tmp4 = add i64 %tmp2, %i5.0
+  %tmp5 = getelementptr i64, i64 addrspace(10)* %arg, i64 %tmp4
+  %tmp6 = load i64, i64 addrspace(10)* %tmp5
+  br i1 undef, label %done, label %if31
+
+done:                                             ; preds = %if38
+  ret void
+}
diff --git a/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll b/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll
index fbf55fd81d23..cbf177c0d4b9 100644
--- a/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll
+++ b/test/Transforms/LoopStrengthReduce/post-inc-icmpzero.ll
@@ -25,6 +25,8 @@ define void @_Z15IntegerToStringjjR7Vector2(i32 %i, i32 %radix, %struct.Vector2*
 entry:
   %buffer = alloca [33 x i16], align 16
   %add.ptr = getelementptr inbounds [33 x i16], [33 x i16]* %buffer, i64 0, i64 33
+  %sub.ptr.lhs.cast = ptrtoint i16* %add.ptr to i64
+  %sub.ptr.rhs.cast = ptrtoint i16* %add.ptr to i64
   br label %do.body
 
 do.body:                                          ; preds = %do.body, %entry
@@ -46,8 +48,6 @@ do.body:                                          ; preds = %do.body, %entry
 do.end:                                           ; preds = %do.body
   %xap.0 = inttoptr i64 %0 to i1*
   %cap.0 = ptrtoint i1* %xap.0 to i64
-  %sub.ptr.lhs.cast = ptrtoint i16* %add.ptr to i64
-  %sub.ptr.rhs.cast = ptrtoint i16* %incdec.ptr to i64
   %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
   %sub.ptr.div39 = lshr exact i64 %sub.ptr.sub, 1
   %conv11 = trunc i64 %sub.ptr.div39 to i32
diff --git a/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
new file mode 100644
index 000000000000..a7f414b8694b
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
@@ -0,0 +1,26 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: all_scalar
+; CHECK:       LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK:       LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK:       LV: Not considering vector loop of width 2 because it will not generate any vector instructions
+;
+define void @all_scalar(i64* %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr i64, i64* %a, i64 %i
+  store i64 0, i64* %tmp0, align 1
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/SystemZ/addressing.ll b/test/Transforms/LoopVectorize/SystemZ/addressing.ll
new file mode 100644
index 000000000000..1f7a6d29c57c
--- /dev/null
+++ b/test/Transforms/LoopVectorize/SystemZ/addressing.ll
@@ -0,0 +1,72 @@
+; RUN: opt -S  -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize -dce \
+; RUN:   -instcombine -force-vector-width=2  < %s | FileCheck %s
+;
+; Test that loop vectorizer does not generate vector addresses that must then
+; always be extracted.
+
+; Check that the addresses for a scalarized memory access is not extracted
+; from a vector register.
+define i32 @foo(i32* nocapture %A) {
+;CHECK-LABEL: @foo(
+;CHECK:  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK:  %0 = shl nsw i64 %index, 2
+;CHECK:  %1 = shl i64 %index, 2
+;CHECK:  %2 = or i64 %1, 4
+;CHECK:  %3 = getelementptr inbounds i32, i32* %A, i64 %0
+;CHECK:  %4 = getelementptr inbounds i32, i32* %A, i64 %2
+;CHECK:  store i32 4, i32* %3, align 4
+;CHECK:  store i32 4, i32* %4, align 4
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %0
+  store i32 4, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 undef
+}
+
+
+; Check that a load of address is scalarized.
+define i32 @foo1(i32* nocapture noalias %A, i32** nocapture %PtrPtr) {
+;CHECK-LABEL: @foo1(
+;CHECK:  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+;CHECK:  %0 = or i64 %index, 1
+;CHECK:  %1 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %index
+;CHECK:  %2 = getelementptr inbounds i32*, i32** %PtrPtr, i64 %0
+;CHECK:  %3 = load i32*, i32** %1, align 8
+;CHECK:  %4 = load i32*, i32** %2, align 8
+;CHECK:  %5 = load i32, i32* %3, align 4
+;CHECK:  %6 = load i32, i32* %4, align 4
+;CHECK:  %7 = insertelement <2 x i32> undef, i32 %5, i32 0
+;CHECK:  %8 = insertelement <2 x i32> %7, i32 %6, i32 1
+;CHECK:  %9 = getelementptr inbounds i32, i32* %A, i64 %index
+;CHECK:  %10 = bitcast i32* %9 to <2 x i32>*
+;CHECK:  store <2 x i32> %8, <2 x i32>* %10, align 4
+
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr = getelementptr inbounds i32*, i32** %PtrPtr, i64 %indvars.iv
+  %el = load i32*, i32** %ptr
+  %v = load i32, i32* %el
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  store i32 %v, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
index b2933c4b56f2..4dc62d86453f 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
@@ -11,38 +11,38 @@
 ;       break;
 ;   }
 ; }
+; File, line, and column should match those specified in the metadata
+; CHECK: remark: source.cpp:4:5: loop not vectorized: could not determine number of loop iterations
+; CHECK: remark: source.cpp:4:5: loop not vectorized
 
 ; void test_disabled(int *A, int Length) {
 ; #pragma clang loop vectorize(disable) interleave(disable)
 ;   for (int i = 0; i < Length; i++)
 ;     A[i] = i;
 ; }
+; CHECK: remark: source.cpp:13:5: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1
 
 ; void test_array_bounds(int *A, int *B, int Length) {
 ; #pragma clang loop vectorize(enable)
 ;   for (int i = 0; i < Length; i++)
 ;     A[i] = A[B[i]];
 ; }
-
-; File, line, and column should match those specified in the metadata
-; CHECK: remark: source.cpp:4:5: loop not vectorized: could not determine number of loop iterations
-; CHECK: remark: source.cpp:4:5: loop not vectorized
-; CHECK: remark: source.cpp:13:5: loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1
 ; CHECK: remark: source.cpp:19:5: loop not vectorized: cannot identify array bounds
 ; CHECK: remark: source.cpp:19:5: loop not vectorized
 ; CHECK: warning: source.cpp:19:5: loop not vectorized: failed explicitly specified loop vectorization
 
-; CHECK: _Z4testPii
-; CHECK-NOT: x i32>
-; CHECK: ret
-
-; CHECK: _Z13test_disabledPii
-; CHECK-NOT: x i32>
-; CHECK: ret
-
-; CHECK: _Z17test_array_boundsPiS_i
-; CHECK-NOT: x i32>
-; CHECK: ret
+; int foo();
+; void test_multiple_failures(int *A) {
+;   int k = 0;
+; #pragma clang loop vectorize(enable) interleave(enable)
+;   for (int i = 0; i < 1000; i+=A[i]) {
+;     if (A[i])
+;       k = foo();
+;   }
+;   return k;
+; }
+; CHECK: remark: source.cpp:29:7: loop not vectorized: control flow cannot be substituted for a select
+; CHECK: remark: source.cpp:27:3: loop not vectorized
 
 ; YAML:       --- !Analysis
 ; YAML-NEXT: Pass:            loop-vectorize
@@ -98,6 +98,41 @@
 ; YAML-NEXT:   - String:          'loop not vectorized: '
 ; YAML-NEXT:   - String:          failed explicitly specified loop vectorization
 ; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            NoCFGForSelect
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 29, Column: 7 }
+; YAML-NEXT: Function:        test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          control flow cannot be substituted for a select
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            NonReductionValueUsedOutsideLoop
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 27, Column: 3 }
+; YAML-NEXT: Function:        test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          value that could not be identified as reduction is used outside the loop
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            CantComputeNumberOfIterations
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 27, Column: 3 }
+; YAML-NEXT: Function:        test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          could not determine number of loop iterations
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Missed
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            MissedDetails
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 27, Column: 3 }
+; YAML-NEXT: Function:        test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          loop not vectorized
+; YAML-NEXT: ...
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -124,6 +159,10 @@ for.end:                                          ; preds = %for.body, %entry
   ret void, !dbg !24
 }
 
+; CHECK: _Z4testPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
 ; Function Attrs: nounwind optsize ssp uwtable
 define void @_Z13test_disabledPii(i32* nocapture %A, i32 %Length) #0 !dbg !7 {
 entry:
@@ -144,6 +183,10 @@ for.end:                                          ; preds = %for.body, %entry
   ret void, !dbg !31
 }
 
+; CHECK: _Z13test_disabledPii
+; CHECK-NOT: x i32>
+; CHECK: ret
+
 ; Function Attrs: nounwind optsize ssp uwtable
 define void @_Z17test_array_boundsPiS_i(i32* nocapture %A, i32* nocapture readonly %B, i32 %Length) #0 !dbg !8 {
 entry:
@@ -174,6 +217,45 @@ for.end:                                          ; preds = %for.end.loopexit, %
   ret void, !dbg !36
 }
 
+; CHECK: _Z17test_array_boundsPiS_i
+; CHECK-NOT: x i32>
+; CHECK: ret
+
+; Function Attrs: nounwind uwtable
+define i32 @test_multiple_failures(i32* nocapture readonly %A) #0 !dbg !46 {
+entry:
+  br label %for.body, !dbg !38
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.09 = phi i32 [ 0, %entry ], [ %add, %for.inc ]
+  %k.09 = phi i32 [ 0, %entry ], [ %k.1, %for.inc ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.09, !dbg !40
+  %0 = load i32, i32* %arrayidx, align 4, !dbg !40
+  %tobool = icmp eq i32 %0, 0, !dbg !40
+  br i1 %tobool, label %for.inc, label %if.then, !dbg !40
+
+if.then:                                          ; preds = %for.body
+  %call = tail call i32 (...) @foo(), !dbg !41
+  %.pre = load i32, i32* %arrayidx, align 4
+  br label %for.inc, !dbg !42
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %1 = phi i32 [ %.pre, %if.then ], [ 0, %for.body ], !dbg !43
+  %k.1 = phi i32 [ %call, %if.then ], [ %k.09, %for.body ]
+  %add = add nsw i32 %1, %i.09, !dbg !44
+  %cmp = icmp slt i32 %add, 1000, !dbg !45
+  br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !38
+
+for.cond.cleanup:                                 ; preds = %for.inc
+  ret i32 %k.1, !dbg !39
+}
+
+declare i32 @foo(...)
+
+; CHECK: test_multiple_failure
+; CHECK-NOT: x i32>
+; CHECK: ret
+
 attributes #0 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
@@ -216,3 +298,13 @@ attributes #0 = { nounwind }
 !34 = !{!34, !15}
 !35 = !DILocation(line: 19, column: 5, scope: !33)
 !36 = !DILocation(line: 20, column: 1, scope: !8)
+!37 = distinct !DILexicalBlock(line: 24, column: 3, file: !1, scope: !46)
+!38 = !DILocation(line: 27, column: 3, scope: !37)
+!39 = !DILocation(line: 31, column: 3, scope: !37)
+!40 = !DILocation(line: 28, column: 9, scope: !37)
+!41 = !DILocation(line: 29, column: 11, scope: !37)
+!42 = !DILocation(line: 29, column: 7, scope: !37)
+!43 = !DILocation(line: 27, column: 32, scope: !37)
+!44 = !DILocation(line: 27, column: 30, scope: !37)
+!45 = !DILocation(line: 27, column: 21, scope: !37)
+!46 = distinct !DISubprogram(name: "test_multiple_failures", line: 26, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 26, file: !1, scope: !5, type: !6, variables: !2)
diff --git a/test/Transforms/NewGVN/pr32403.ll b/test/Transforms/NewGVN/pr32403.ll
index 2552e0e66ab9..505d31a9463e 100644
--- a/test/Transforms/NewGVN/pr32403.ll
+++ b/test/Transforms/NewGVN/pr32403.ll
@@ -17,7 +17,8 @@ define void @reorder_ref_pic_list() local_unnamed_addr {
 ; CHECK-NEXT:    [[INC_I:%.*]] = add nsw i32 [[REFIDXLX_0]], 1
 ; CHECK-NEXT:    br label [[FOR_BODY8_I:%.*]]
 ; CHECK:       for.body8.i:
-; CHECK-NEXT:    br i1 undef, label [[FOR_INC24_I:%.*]], label [[IF_THEN17_I:%.*]]
+; CHECK-NEXT:    [[NIDX_052_I:%.*]] = phi i32 [ [[INC_I]], [[IF_THEN13]] ], [ [[NIDX_052_I]], [[FOR_INC24_I:%.*]] ]
+; CHECK-NEXT:    br i1 undef, label [[FOR_INC24_I]], label [[IF_THEN17_I:%.*]]
 ; CHECK:       if.then17.i:
 ; CHECK-NEXT:    br label [[FOR_INC24_I]]
 ; CHECK:       for.inc24.i:
diff --git a/test/Transforms/NewGVN/pr32836.ll b/test/Transforms/NewGVN/pr32836.ll
new file mode 100644
index 000000000000..623f216101bf
--- /dev/null
+++ b/test/Transforms/NewGVN/pr32836.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -newgvn %s | FileCheck %s
+
+%struct.anon = type { i32 }
+@b = external global %struct.anon
+define void @tinkywinky(i1 %patatino) {
+; CHECK-LABEL: @tinkywinky(
+; CHECK-NEXT:    store i32 8, i32* null
+; CHECK-NEXT:    br i1 [[PATATINO:%.*]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[L:%.*]]
+; CHECK:       L:
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* null
+; CHECK-NEXT:    [[BF_LOAD1:%.*]] = load i32, i32* getelementptr inbounds (%struct.anon, %struct.anon* @b, i64 0, i32 0)
+; CHECK-NEXT:    [[BF_VALUE:%.*]] = and i32 [[TMP1]], 536870911
+; CHECK-NEXT:    [[BF_CLEAR:%.*]] = and i32 [[BF_LOAD1]], -536870912
+; CHECK-NEXT:    [[BF_SET:%.*]] = or i32 [[BF_CLEAR]], [[BF_VALUE]]
+; CHECK-NEXT:    store i32 [[BF_SET]], i32* getelementptr inbounds (%struct.anon, %struct.anon* @b, i64 0, i32 0)
+; CHECK-NEXT:    br label [[LOR_END:%.*]]
+; CHECK:       lor.end:
+; CHECK-NEXT:    br label [[L]]
+;
+  store i32 8, i32* null
+  br i1 %patatino, label %if.end, label %if.then
+if.then:
+  store i32 8, i32* null
+  br label %L
+L:
+  br label %if.end
+if.end:
+  %tmp1 = load i32, i32* null
+  %bf.load1 = load i32, i32* getelementptr (%struct.anon, %struct.anon* @b, i64 0, i32 0)
+  %bf.value = and i32 %tmp1, 536870911
+  %bf.clear = and i32 %bf.load1, -536870912
+  %bf.set = or i32 %bf.clear, %bf.value
+  store i32 %bf.set, i32* getelementptr (%struct.anon, %struct.anon* @b, i64 0, i32 0)
+  br label %lor.end
+lor.end:
+  %bf.load4 = load i32, i32* getelementptr (%struct.anon, %struct.anon* @b, i64 0, i32 0)
+  %tmp4 = and i32 %bf.load4, 536870911
+  %or = or i32 0, %tmp4
+  br label %L
+}
diff --git a/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll b/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
index 3ac3c5138ae7..a97e3f81a8ef 100644
--- a/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
+++ b/test/Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
@@ -382,3 +382,64 @@ loop_exit2:
 ; CHECK-NEXT:    %[[R:.*]] = add i32 %[[R1]], %[[R2]]
 ; CHECK-NEXT:    ret i32 %[[R]]
 }
+
+; This test, extracted from the LLVM test suite, has an interesting dominator
+; tree to update as there are edges to sibling domtree nodes within child
+; domtree nodes of the unswitched node.
+define void @xgets(i1 %cond1, i1* %cond2.ptr) {
+; CHECK-LABEL: @xgets(
+entry:
+  br label %for.cond.preheader
+; CHECK:       entry:
+; CHECK-NEXT:    br label %for.cond.preheader
+
+for.cond.preheader:
+  br label %for.cond
+; CHECK:       for.cond.preheader:
+; CHECK-NEXT:    br i1 %cond1, label %for.cond.preheader.split, label %if.end17.thread.loopexit
+;
+; CHECK:       for.cond.preheader.split:
+; CHECK-NEXT:    br label %for.cond
+
+for.cond:
+  br i1 %cond1, label %land.lhs.true, label %if.end17.thread.loopexit
+; CHECK:       for.cond:
+; CHECK-NEXT:    br label %land.lhs.true
+
+land.lhs.true:
+  br label %if.then20
+; CHECK:       land.lhs.true:
+; CHECK-NEXT:    br label %if.then20
+
+if.then20:
+  %cond2 = load volatile i1, i1* %cond2.ptr
+  br i1 %cond2, label %if.then23, label %if.else
+; CHECK:       if.then20:
+; CHECK-NEXT:    %[[COND2:.*]] = load volatile i1, i1* %cond2.ptr
+; CHECK-NEXT:    br i1 %[[COND2]], label %if.then23, label %if.else
+
+if.else:
+  br label %for.cond
+; CHECK:       if.else:
+; CHECK-NEXT:    br label %for.cond
+
+if.end17.thread.loopexit:
+  br label %if.end17.thread
+; CHECK:       if.end17.thread.loopexit:
+; CHECK-NEXT:    br label %if.end17.thread
+
+if.end17.thread:
+  br label %cleanup
+; CHECK:       if.end17.thread:
+; CHECK-NEXT:    br label %cleanup
+
+if.then23:
+  br label %cleanup
+; CHECK:       if.then23:
+; CHECK-NEXT:    br label %cleanup
+
+cleanup:
+  ret void
+; CHECK:       cleanup:
+; CHECK-NEXT:    ret void
+}
diff --git a/test/Verifier/fp-intrinsics.ll b/test/Verifier/fp-intrinsics.ll
index 0a308115cc35..cc3c3bc2ed88 100644
--- a/test/Verifier/fp-intrinsics.ll
+++ b/test/Verifier/fp-intrinsics.ll
@@ -1,13 +1,17 @@
 ; RUN: opt -verify -S < %s 2>&1 | FileCheck --check-prefix=CHECK1 %s
 ; RUN: sed -e s/.T2:// %s | not opt -verify -disable-output 2>&1 | FileCheck --check-prefix=CHECK2 %s
 ; RUN: sed -e s/.T3:// %s | not opt -verify -disable-output 2>&1 | FileCheck --check-prefix=CHECK3 %s
+; RUN: sed -e s/.T4:// %s | not opt -verify -disable-output 2>&1 | FileCheck --check-prefix=CHECK4 %s
+; RUN: sed -e s/.T5:// %s | not opt -verify -disable-output 2>&1 | FileCheck --check-prefix=CHECK5 %s
 
-; Common declaration used for all runs.
+; Common declarations used for all runs.
 declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata)
+declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata)
 
 ; Test that the verifier accepts legal code, and that the correct attributes are
 ; attached to the FP intrinsic.
 ; CHECK1: declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) #[[ATTR:[0-9]+]]
+; CHECK1: declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) #[[ATTR]]
 ; CHECK1: attributes #[[ATTR]] = { inaccessiblememonly nounwind }
 ; Note: FP exceptions aren't usually caught through normal unwind mechanisms,
 ;       but we may want to revisit this for asynchronous exception handling.
@@ -20,6 +24,15 @@ entry:
   ret double %fadd
 }
 
+define double @f1u(double %a) {
+entry:
+  %fsqrt = call double @llvm.experimental.constrained.sqrt.f64(
+                                               double %a,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+  ret double %fsqrt
+}
+
 ; Test an illegal value for the rounding mode argument.
 ; CHECK2: invalid rounding mode argument
 ;T2: define double @f2(double %a, double %b) {
@@ -33,7 +46,7 @@ entry:
 
 ; Test an illegal value for the exception behavior argument.
 ; CHECK3: invalid exception behavior argument
-;T3: define double @f2(double %a, double %b) {
+;T3: define double @f3(double %a, double %b) {
 ;T3: entry:
 ;T3:   %fadd = call double @llvm.experimental.constrained.fadd.f64(
 ;T3:                                         double %a, double %b,
@@ -41,3 +54,25 @@ entry:
 ;T3:                                         metadata !"fpexcept.restrict")
 ;T3:   ret double %fadd
 ;T3: }
+
+; Test an illegal value for the rounding mode argument.
+; CHECK4: invalid rounding mode argument
+;T4: define double @f4(double %a) {
+;T4: entry:
+;T4:   %fadd = call double @llvm.experimental.constrained.sqrt.f64(
+;T4:                                           double %a,
+;T4:                                           metadata !"round.dynomite",
+;T4:                                           metadata !"fpexcept.strict")
+;T4:   ret double %fadd
+;T4: }
+
+; Test an illegal value for the exception behavior argument.
+; CHECK5: invalid exception behavior argument
+;T5: define double @f5(double %a) {
+;T5: entry:
+;T5:   %fadd = call double @llvm.experimental.constrained.sqrt.f64(
+;T5:                                         double %a,
+;T5:                                         metadata !"round.dynamic",
+;T5:                                         metadata !"fpexcept.restrict")
+;T5:   ret double %fadd
+;T5: }
diff --git a/test/Verifier/module-flags-1.ll b/test/Verifier/module-flags-1.ll
index 36bcb335ffc2..ff82c2845235 100644
--- a/test/Verifier/module-flags-1.ll
+++ b/test/Verifier/module-flags-1.ll
@@ -41,6 +41,10 @@
 ; CHECK-NOT: invalid value for 'append'-type module flag (expected a metadata node)
 !18 = !{i32 5, !"flag-4", !{i32 57}}
 
+; Check that any 'max' module flags are valid.
+; CHECK: invalid value for 'max' module flag (expected constant integer)
+!19 = !{i32 7, !"max", !"max"}
+
 ; Check that any 'require' module flags are valid.
 ; CHECK: invalid requirement on flag, flag is not present in module
 !11 = !{i32 3, !"bar", !{!"no-such-flag", i32 52}}
@@ -54,4 +58,4 @@
 
 !llvm.module.flags = !{
   !0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15,
-  !16, !17, !18 }
+  !16, !17, !18, !19 }
diff --git a/test/tools/gold/X86/relocation-model-pic.ll b/test/tools/gold/X86/relocation-model-pic.ll
new file mode 100644
index 000000000000..65b7beecc22d
--- /dev/null
+++ b/test/tools/gold/X86/relocation-model-pic.ll
@@ -0,0 +1,63 @@
+; RUN: cat %s >%t.pic.ll
+; RUN: echo '!llvm.module.flags = !{!0}' >>%t.pic.ll
+; RUN: echo '!0 = !{i32 1, !"PIC Level", i32 2}' >>%t.pic.ll
+
+; RUN: llvm-as %s -o %t.o
+; RUN: llvm-as %t.pic.ll -o %t.pic.o
+
+;; Non-PIC source.
+
+; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --shared \
+; RUN:    --plugin-opt=save-temps %t.o -o %t-out
+; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=PIC
+
+; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --export-dynamic --noinhibit-exec -pie \
+; RUN:    --plugin-opt=save-temps %t.o -o %t-out
+; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=PIC
+
+; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --export-dynamic --noinhibit-exec \
+; RUN:    --plugin-opt=save-temps %t.o -o %t-out
+; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=STATIC
+
+; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    -r \
+; RUN:    --plugin-opt=save-temps %t.o -o %t-out
+; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=STATIC
+
+;; PIC source.
+
+; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --shared \
+; RUN:    --plugin-opt=save-temps %t.pic.o -o %t-out
+; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=PIC
+
+; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --export-dynamic --noinhibit-exec -pie \
+; RUN:    --plugin-opt=save-temps %t.pic.o -o %t-out
+; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=PIC
+
+; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    --export-dynamic --noinhibit-exec \
+; RUN:    --plugin-opt=save-temps %t.pic.o -o %t-out
+; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=STATIC
+
+; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    -r \
+; RUN:    --plugin-opt=save-temps %t.pic.o -o %t-out
+; RUN: llvm-readobj -r %t-out.o | FileCheck %s --check-prefix=PIC
+
+
+; PIC: R_X86_64_GOTPCREL foo
+; STATIC: R_X86_64_PC32 foo
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@foo = external global i32
+define i32 @main() {
+  %t = load i32, i32* @foo
+  ret i32 %t
+}
diff --git a/test/tools/llvm-nm/X86/Inputs/example.lib b/test/tools/llvm-nm/X86/Inputs/example.lib
new file mode 100644
index 000000000000..edcd888f2ba9
--- /dev/null
+++ b/test/tools/llvm-nm/X86/Inputs/example.lib
diff --git a/test/tools/llvm-nm/X86/importlibrary.test b/test/tools/llvm-nm/X86/importlibrary.test
new file mode 100644
index 000000000000..9111694c2c6f
--- /dev/null
+++ b/test/tools/llvm-nm/X86/importlibrary.test
@@ -0,0 +1,7 @@
+# RUN: llvm-nm -B %S/Inputs/example.lib | FileCheck --match-full-lines %s
+
+CHECK: 00000000 R __imp__constant
+CHECK: 00000000 R _constant
+CHECK: 00000000 D __imp__data
+CHECK: 00000000 T __imp__function
+CHECK: 00000000 T _function
diff --git a/test/tools/llvm-profdata/memop-size-prof.proftext b/test/tools/llvm-profdata/memop-size-prof.proftext
index 882fc1ecf296..79dc0f8a1632 100644
--- a/test/tools/llvm-profdata/memop-size-prof.proftext
+++ b/test/tools/llvm-profdata/memop-size-prof.proftext
@@ -67,7 +67,7 @@ ic2:20000
 7:33
 8:22
 
-#MEMOP: Memory Instrinsic Size Results:
+#MEMOP: Memory Intrinsic Size Results:
 #MEMOP-NEXT:  [ 0, 1, 99 ]
 #MEMOP-NEXT:  [ 0, 2, 88 ]
 #MEMOP-NEXT:  [ 0, 3, 77 ]