author    Dimitry Andric <dim@FreeBSD.org>  2018-01-24 20:23:48 +0000
committer Dimitry Andric <dim@FreeBSD.org>  2018-01-24 20:23:48 +0000
commit    a096e0bdf6cfa020569afca490d8e4c9ac8ebb01 (patch)
tree      39ef21ba905e021d44b9a5fb47336d4a864da27e
parent    d215fd3b74b90f5dc1964610926fcc2a20f959aa (diff)

Vendor import of llvm release_60 branch r323338 (tag: vendor/llvm/llvm-release_60-r323338)

Notes:
    svn path=/vendor/llvm/dist-release_60/; revision=328362
    svn path=/vendor/llvm/llvm-release_60-r323338/; revision=328363; tag=vendor/llvm/llvm-release_60-r323338
-rw-r--r--  cmake/modules/LLVMConfig.cmake.in | 2
-rw-r--r--  docs/ReleaseNotes.rst | 58
-rw-r--r--  include/llvm/Analysis/RegionInfoImpl.h | 12
-rw-r--r--  include/llvm/CodeGen/SelectionDAGAddressAnalysis.h | 2
-rw-r--r--  include/llvm/MC/MCCodeView.h | 48
-rw-r--r--  include/llvm/Support/GenericDomTreeConstruction.h | 66
-rw-r--r--  include/llvm/Transforms/Vectorize/SLPVectorizer.h | 7
-rw-r--r--  lib/CodeGen/CodeGenPrepare.cpp | 7
-rw-r--r--  lib/CodeGen/GlobalMerge.cpp | 3
-rw-r--r--  lib/CodeGen/PeepholeOptimizer.cpp | 41
-rw-r--r--  lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 120
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 4
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 7
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp | 21
-rw-r--r--  lib/CodeGen/TargetLoweringBase.cpp | 15
-rw-r--r--  lib/Linker/IRMover.cpp | 7
-rw-r--r--  lib/MC/MCCodeView.cpp | 69
-rw-r--r--  lib/Target/AArch64/AArch64InstructionSelector.cpp | 34
-rw-r--r--  lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp | 20
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp | 43
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.h | 6
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.td | 12
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 7
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 62
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp | 3
-rw-r--r--  lib/Transforms/Scalar/GVNHoist.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/StructurizeCFG.cpp | 110
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp | 9
-rw-r--r--  lib/Transforms/Vectorize/SLPVectorizer.cpp | 61
-rw-r--r--  test/CodeGen/AArch64/GlobalISel/select-gv-cmodel-large.mir | 61
-rw-r--r--  test/CodeGen/AArch64/atomic-ops-lse.ll | 43
-rw-r--r--  test/CodeGen/AMDGPU/multilevel-break.ll | 3
-rw-r--r--  test/CodeGen/AMDGPU/nested-loop-conditions.ll | 127
-rw-r--r--  test/CodeGen/ARM/and-load-combine.ll | 14
-rw-r--r--  test/CodeGen/ARM/atomic-cmpxchg.ll | 3
-rw-r--r--  test/CodeGen/ARM/cmpxchg-O0.ll | 6
-rw-r--r--  test/CodeGen/ARM/global-merge-dllexport.ll | 15
-rw-r--r--  test/CodeGen/ARM/global-merge-external.ll | 29
-rw-r--r--  test/CodeGen/ARM/peephole-phi.mir | 67
-rw-r--r--  test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll | 94
-rw-r--r--  test/CodeGen/PowerPC/atomics-regression.ll | 40
-rw-r--r--  test/CodeGen/X86/avx512-shuffles/partial_permute.ll | 39
-rw-r--r--  test/CodeGen/X86/darwin-bzero.ll | 9
-rw-r--r--  test/CodeGen/X86/inline-asm-A-constraint.ll | 3
-rw-r--r--  test/CodeGen/X86/pr35761.ll | 36
-rw-r--r--  test/CodeGen/X86/pr35972.ll | 20
-rw-r--r--  test/CodeGen/X86/pr37563.ll | 42
-rw-r--r--  test/CodeGen/X86/var-permute-128.ll | 5
-rw-r--r--  test/CodeGen/X86/var-permute-256.ll | 180
-rw-r--r--  test/MC/COFF/cv-inline-linetable.s | 26
-rw-r--r--  test/MC/X86/x86-64.s | 38
-rw-r--r--  test/ThinLTO/X86/Inputs/dicompositetype-unique2.ll | 46
-rw-r--r--  test/ThinLTO/X86/dicompositetype-unique2.ll | 69
-rw-r--r--  test/Transforms/CodeGenPrepare/X86/sink-addrmode-select.ll | 19
-rw-r--r--  test/Transforms/GVNHoist/pr35222-hoist-load.ll | 45
-rw-r--r--  test/Transforms/JumpThreading/ddt-crash3.ll | 43
-rw-r--r--  test/Transforms/JumpThreading/ddt-crash4.ll | 75
-rw-r--r--  test/Transforms/LoopVectorize/pr35773.ll | 53
-rw-r--r--  test/Transforms/SLPVectorizer/X86/PR35628_1.ll | 74
-rw-r--r--  test/Transforms/SLPVectorizer/X86/PR35628_2.ll | 64
-rw-r--r--  test/Transforms/SLPVectorizer/X86/PR35777.ll | 48
-rw-r--r--  test/Transforms/SLPVectorizer/X86/PR35865.ll | 27
-rw-r--r--  test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll | 220
-rw-r--r--  test/Transforms/SLPVectorizer/X86/insertvalue.ll | 162
-rw-r--r--  test/Transforms/SLPVectorizer/X86/value-bug.ll | 48
-rw-r--r--  test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug-xfail.ll | 77
-rw-r--r--  test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll | 163
-rw-r--r--  test/Transforms/StructurizeCFG/AMDGPU/lit.local.cfg | 2
-rw-r--r--  test/Transforms/StructurizeCFG/nested-loop-order.ll | 83
-rw-r--r--  test/tools/llvm-readobj/macho-needed-libs.test | 26
-rw-r--r--  tools/llvm-readobj/MachODumper.cpp | 30
-rw-r--r--  unittests/IR/DominatorTreeBatchUpdatesTest.cpp | 95
-rw-r--r--  unittests/IR/DominatorTreeTest.cpp | 25
-rwxr-xr-x  utils/release/test-release.sh | 10
74 files changed, 2671 insertions, 591 deletions
diff --git a/cmake/modules/LLVMConfig.cmake.in b/cmake/modules/LLVMConfig.cmake.in
index 077201691656..fe4df5278498 100644
--- a/cmake/modules/LLVMConfig.cmake.in
+++ b/cmake/modules/LLVMConfig.cmake.in
@@ -37,6 +37,8 @@ set(LLVM_ENABLE_THREADS @LLVM_ENABLE_THREADS@)
set(LLVM_ENABLE_ZLIB @LLVM_ENABLE_ZLIB@)
+set(LLVM_LIBXML2_ENABLED @LLVM_LIBXML2_ENABLED@)
+
set(LLVM_ENABLE_DIA_SDK @LLVM_ENABLE_DIA_SDK@)
set(LLVM_NATIVE_ARCH @LLVM_NATIVE_ARCH@)
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index 41b9cf92d767..8ef9f6b86c51 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -54,6 +54,8 @@ Non-comprehensive list of changes in this release
``DIVariables`` to the instructions in a ``Module``. The ``CheckDebugify``
pass determines how much of the metadata is lost.
+* Significantly improved quality of CodeView debug info for Windows.
+
* Note..
.. NOTE
@@ -69,10 +71,13 @@ Non-comprehensive list of changes in this release
Changes to the LLVM IR
----------------------
-Changes to the ARM Backend
---------------------------
+Changes to the ARM Target
+-------------------------
- During this release ...
+During this release the ARM target has:
+
+* Got support for enabling SjLj exception handling on platforms where it
+ isn't the default.
Changes to the MIPS Target
@@ -89,7 +94,10 @@ Changes to the PowerPC Target
Changes to the X86 Target
-------------------------
- During this release ...
+During this release ...
+
+* Got support for enabling SjLj exception handling on platforms where it
+ isn't the default.
Changes to the AMDGPU Target
-----------------------------
@@ -116,8 +124,46 @@ Changes to the C API
External Open Source Projects Using LLVM 6
==========================================
-* A project...
-
+JFS - JIT Fuzzing Solver
+------------------------
+
+`JFS <https://github.com/delcypher/jfs>`_ is an experimental constraint solver
+designed to investigate using coverage guided fuzzing as an incomplete strategy
+for solving boolean, BitVector, and floating-point constraints.
+It is built on top of LLVM, Clang, LibFuzzer, and Z3.
+
+The solver works by generating a C++ program where the reachability of an
+`abort()` statement is equivalent to finding a satisfying assignment to the
+constraints. This program is then compiled by Clang with `SanitizerCoverage
+<https://releases.llvm.org/6.0.0/tools/clang/docs/SanitizerCoverage.html>`_
+instrumentation and then fuzzed using :doc:`LibFuzzer <LibFuzzer>`.
+
+Zig Programming Language
+------------------------
+
+`Zig <http://ziglang.org>`_ is an open-source programming language designed
+for robustness, optimality, and clarity. It is intended to replace C. It
+provides high level features such as Generics,
+Compile Time Function Execution, and Partial Evaluation, yet exposes low level
+LLVM IR features such as Aliases. Zig uses Clang to provide automatic
+import of .h symbols - even inline functions and macros. Zig uses LLD combined
+with lazily building compiler-rt to provide out-of-the-box cross-compiling for
+all supported targets.
+
+LDC - the LLVM-based D compiler
+-------------------------------
+
+`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
+pragmatically combines efficiency, control, and modeling power, with safety and
+programmer productivity. D supports powerful concepts like Compile-Time Function
+Execution (CTFE) and Template Meta-Programming, provides an innovative approach
+to concurrency and offers many classical paradigms.
+
+`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
+combined with LLVM as backend to produce efficient native code. LDC targets
+x86/x86_64 systems like Linux, OS X, FreeBSD and Windows and also Linux on ARM
+and PowerPC (32/64 bit). Ports to other architectures like AArch64 and MIPS64
+are underway.
Additional Information
======================
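To make the JFS description above concrete, here is a minimal hand-written sketch of the abort-reachability encoding it describes. This is a hypothetical constraint and example code, not actual JFS output: the free variable is decoded from the fuzz input, and reaching abort() is equivalent to finding a satisfying assignment. Built with, e.g., clang++ -fsanitize=fuzzer,address jfs_sketch.cpp.

    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    // Constraint under test: x > 10 && x*x < 200 (satisfiable, e.g. x == 11).
    extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
      if (Size < sizeof(int32_t))
        return 0;
      int32_t x;
      std::memcpy(&x, Data, sizeof(x)); // decode the free variable
      if (x > 10 && (int64_t)x * x < 200)
        abort(); // reachable iff the constraint has a model
      return 0;
    }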
diff --git a/include/llvm/Analysis/RegionInfoImpl.h b/include/llvm/Analysis/RegionInfoImpl.h
index 6e522354dd9b..eb6baac2d5e4 100644
--- a/include/llvm/Analysis/RegionInfoImpl.h
+++ b/include/llvm/Analysis/RegionInfoImpl.h
@@ -254,23 +254,23 @@ std::string RegionBase<Tr>::getNameStr() const {
template <class Tr>
void RegionBase<Tr>::verifyBBInRegion(BlockT *BB) const {
if (!contains(BB))
- llvm_unreachable("Broken region found: enumerated BB not in region!");
+ report_fatal_error("Broken region found: enumerated BB not in region!");
BlockT *entry = getEntry(), *exit = getExit();
for (BlockT *Succ :
make_range(BlockTraits::child_begin(BB), BlockTraits::child_end(BB))) {
if (!contains(Succ) && exit != Succ)
- llvm_unreachable("Broken region found: edges leaving the region must go "
- "to the exit node!");
+ report_fatal_error("Broken region found: edges leaving the region must go "
+ "to the exit node!");
}
if (entry != BB) {
for (BlockT *Pred : make_range(InvBlockTraits::child_begin(BB),
InvBlockTraits::child_end(BB))) {
if (!contains(Pred))
- llvm_unreachable("Broken region found: edges entering the region must "
- "go to the entry node!");
+ report_fatal_error("Broken region found: edges entering the region must "
+ "go to the entry node!");
}
}
}
@@ -557,7 +557,7 @@ void RegionInfoBase<Tr>::verifyBBMap(const RegionT *R) const {
} else {
BlockT *BB = Element->template getNodeAs<BlockT>();
if (getRegionFor(BB) != R)
- llvm_unreachable("BB map does not match region nesting");
+ report_fatal_error("BB map does not match region nesting");
}
}
}
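The hunk above swaps llvm_unreachable() for report_fatal_error() in region verification. The distinction matters: with assertions disabled (NDEBUG), llvm_unreachable() lowers to undefined behavior, so a verifier built on it silently misses broken invariants in release builds, while report_fatal_error() always prints its message and aborts. A minimal sketch of the pattern (hypothetical checker, assuming LLVM's Support headers are available):

    #include "llvm/Support/ErrorHandling.h"

    static void verifyRegionCount(unsigned Found, unsigned Expected) {
      if (Found != Expected)
        llvm::report_fatal_error("Broken region found: count mismatch");
      // An llvm_unreachable() here would compile to undefined behavior in
      // NDEBUG builds instead of producing a diagnosable crash.
    }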
diff --git a/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h b/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
index 18e4c7a83def..580606441a9d 100644
--- a/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
+++ b/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
@@ -56,7 +56,7 @@ public:
int64_t &Off);
/// Parses tree in Ptr for base, index, offset addresses.
- static BaseIndexOffset match(SDValue Ptr, const SelectionDAG &DAG);
+ static BaseIndexOffset match(LSBaseSDNode *N, const SelectionDAG &DAG);
};
} // end namespace llvm
diff --git a/include/llvm/MC/MCCodeView.h b/include/llvm/MC/MCCodeView.h
index e2249f49c86c..c8f14515ed34 100644
--- a/include/llvm/MC/MCCodeView.h
+++ b/include/llvm/MC/MCCodeView.h
@@ -177,13 +177,7 @@ public:
unsigned IACol);
/// Retrieve the function info if this is a valid function id, or nullptr.
- MCCVFunctionInfo *getCVFunctionInfo(unsigned FuncId) {
- if (FuncId >= Functions.size())
- return nullptr;
- if (Functions[FuncId].isUnallocatedFunctionInfo())
- return nullptr;
- return &Functions[FuncId];
- }
+ MCCVFunctionInfo *getCVFunctionInfo(unsigned FuncId);
/// Saves the information from the currently parsed .cv_loc directive
/// and sets CVLocSeen. When the next instruction is assembled an entry
@@ -199,50 +193,22 @@ public:
CurrentCVLoc.setIsStmt(IsStmt);
CVLocSeen = true;
}
- void clearCVLocSeen() { CVLocSeen = false; }
bool getCVLocSeen() { return CVLocSeen; }
+ void clearCVLocSeen() { CVLocSeen = false; }
+
const MCCVLoc &getCurrentCVLoc() { return CurrentCVLoc; }
bool isValidCVFileNumber(unsigned FileNumber);
/// \brief Add a line entry.
- void addLineEntry(const MCCVLineEntry &LineEntry) {
- size_t Offset = MCCVLines.size();
- auto I = MCCVLineStartStop.insert(
- {LineEntry.getFunctionId(), {Offset, Offset + 1}});
- if (!I.second)
- I.first->second.second = Offset + 1;
- MCCVLines.push_back(LineEntry);
- }
+ void addLineEntry(const MCCVLineEntry &LineEntry);
- std::vector<MCCVLineEntry> getFunctionLineEntries(unsigned FuncId) {
- std::vector<MCCVLineEntry> FilteredLines;
+ std::vector<MCCVLineEntry> getFunctionLineEntries(unsigned FuncId);
- auto I = MCCVLineStartStop.find(FuncId);
- if (I != MCCVLineStartStop.end())
- for (size_t Idx = I->second.first, End = I->second.second; Idx != End;
- ++Idx)
- if (MCCVLines[Idx].getFunctionId() == FuncId)
- FilteredLines.push_back(MCCVLines[Idx]);
- return FilteredLines;
- }
-
- std::pair<size_t, size_t> getLineExtent(unsigned FuncId) {
- auto I = MCCVLineStartStop.find(FuncId);
- // Return an empty extent if there are no cv_locs for this function id.
- if (I == MCCVLineStartStop.end())
- return {~0ULL, 0};
- return I->second;
- }
+ std::pair<size_t, size_t> getLineExtent(unsigned FuncId);
- ArrayRef<MCCVLineEntry> getLinesForExtent(size_t L, size_t R) {
- if (R <= L)
- return None;
- if (L >= MCCVLines.size())
- return None;
- return makeArrayRef(&MCCVLines[L], R - L);
- }
+ ArrayRef<MCCVLineEntry> getLinesForExtent(size_t L, size_t R);
/// Emits a line table substream.
void emitLineTableForFunction(MCObjectStreamer &OS, unsigned FuncId,
diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h
index 8f801662d0fb..25175fe66aa8 100644
--- a/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/include/llvm/Support/GenericDomTreeConstruction.h
@@ -628,7 +628,7 @@ struct SemiNCAInfo {
DecreasingLevel>
Bucket; // Queue of tree nodes sorted by level in descending order.
SmallDenseSet<TreeNodePtr, 8> Affected;
- SmallDenseSet<TreeNodePtr, 8> Visited;
+ SmallDenseMap<TreeNodePtr, unsigned, 8> Visited;
SmallVector<TreeNodePtr, 8> AffectedQueue;
SmallVector<TreeNodePtr, 8> VisitedNotAffectedQueue;
};
@@ -706,7 +706,7 @@ struct SemiNCAInfo {
// algorithm does not really know or use the set of roots and can make a
// different (implicit) decision about which nodes within an infinite loop
// becomes a root.
- if (DT.isVirtualRoot(TN->getIDom())) {
+ if (TN && !DT.isVirtualRoot(TN->getIDom())) {
DEBUG(dbgs() << "Root " << BlockNamePrinter(R)
<< " is not virtual root's child\n"
<< "The entire tree needs to be rebuilt\n");
@@ -753,14 +753,16 @@ struct SemiNCAInfo {
while (!II.Bucket.empty()) {
const TreeNodePtr CurrentNode = II.Bucket.top().second;
+ const unsigned CurrentLevel = CurrentNode->getLevel();
II.Bucket.pop();
DEBUG(dbgs() << "\tAdding to Visited and AffectedQueue: "
<< BlockNamePrinter(CurrentNode) << "\n");
- II.Visited.insert(CurrentNode);
+
+ II.Visited.insert({CurrentNode, CurrentLevel});
II.AffectedQueue.push_back(CurrentNode);
// Discover and collect affected successors of the current node.
- VisitInsertion(DT, BUI, CurrentNode, CurrentNode->getLevel(), NCD, II);
+ VisitInsertion(DT, BUI, CurrentNode, CurrentLevel, NCD, II);
}
// Finish by updating immediate dominators and levels.
@@ -772,13 +774,17 @@ struct SemiNCAInfo {
const TreeNodePtr TN, const unsigned RootLevel,
const TreeNodePtr NCD, InsertionInfo &II) {
const unsigned NCDLevel = NCD->getLevel();
- DEBUG(dbgs() << "Visiting " << BlockNamePrinter(TN) << "\n");
+ DEBUG(dbgs() << "Visiting " << BlockNamePrinter(TN) << ", RootLevel "
+ << RootLevel << "\n");
SmallVector<TreeNodePtr, 8> Stack = {TN};
assert(TN->getBlock() && II.Visited.count(TN) && "Preconditions!");
+ SmallPtrSet<TreeNodePtr, 8> Processed;
+
do {
TreeNodePtr Next = Stack.pop_back_val();
+ DEBUG(dbgs() << " Next: " << BlockNamePrinter(Next) << "\n");
for (const NodePtr Succ :
ChildrenGetter<IsPostDom>::Get(Next->getBlock(), BUI)) {
@@ -786,19 +792,31 @@ struct SemiNCAInfo {
assert(SuccTN && "Unreachable successor found at reachable insertion");
const unsigned SuccLevel = SuccTN->getLevel();
- DEBUG(dbgs() << "\tSuccessor " << BlockNamePrinter(Succ)
- << ", level = " << SuccLevel << "\n");
+ DEBUG(dbgs() << "\tSuccessor " << BlockNamePrinter(Succ) << ", level = "
+ << SuccLevel << "\n");
+
+ // Do not process the same node multiple times.
+ if (Processed.count(Next) > 0)
+ continue;
// Succ dominated by subtree From -- not affected.
// (Based on the lemma 2.5 from the second paper.)
if (SuccLevel > RootLevel) {
DEBUG(dbgs() << "\t\tDominated by subtree From\n");
- if (II.Visited.count(SuccTN) != 0)
- continue;
+ if (II.Visited.count(SuccTN) != 0) {
+ DEBUG(dbgs() << "\t\t\talready visited at level "
+ << II.Visited[SuccTN] << "\n\t\t\tcurrent level "
+ << RootLevel << ")\n");
+
+ // A node may need to be visited again if we see it again at
+ // a lower level than before.
+ if (II.Visited[SuccTN] >= RootLevel)
+ continue;
+ }
DEBUG(dbgs() << "\t\tMarking visited not affected "
<< BlockNamePrinter(Succ) << "\n");
- II.Visited.insert(SuccTN);
+ II.Visited.insert({SuccTN, RootLevel});
II.VisitedNotAffectedQueue.push_back(SuccTN);
Stack.push_back(SuccTN);
} else if ((SuccLevel > NCDLevel + 1) &&
@@ -809,6 +827,8 @@ struct SemiNCAInfo {
II.Bucket.push({SuccLevel, SuccTN});
}
}
+
+ Processed.insert(Next);
} while (!Stack.empty());
}
@@ -920,21 +940,21 @@ struct SemiNCAInfo {
const NodePtr NCDBlock = DT.findNearestCommonDominator(From, To);
const TreeNodePtr NCD = DT.getNode(NCDBlock);
- // To dominates From -- nothing to do.
- if (ToTN == NCD) return;
+ // If To dominates From, there is nothing to do.
+ if (ToTN != NCD) {
+ DT.DFSInfoValid = false;
- DT.DFSInfoValid = false;
-
- const TreeNodePtr ToIDom = ToTN->getIDom();
- DEBUG(dbgs() << "\tNCD " << BlockNamePrinter(NCD) << ", ToIDom "
- << BlockNamePrinter(ToIDom) << "\n");
+ const TreeNodePtr ToIDom = ToTN->getIDom();
+ DEBUG(dbgs() << "\tNCD " << BlockNamePrinter(NCD) << ", ToIDom "
+ << BlockNamePrinter(ToIDom) << "\n");
- // To remains reachable after deletion.
- // (Based on the caption under Figure 4. from the second paper.)
- if (FromTN != ToIDom || HasProperSupport(DT, BUI, ToTN))
- DeleteReachable(DT, BUI, FromTN, ToTN);
- else
- DeleteUnreachable(DT, BUI, ToTN);
+ // To remains reachable after deletion.
+ // (Based on the caption under Figure 4. from the second paper.)
+ if (FromTN != ToIDom || HasProperSupport(DT, BUI, ToTN))
+ DeleteReachable(DT, BUI, FromTN, ToTN);
+ else
+ DeleteUnreachable(DT, BUI, ToTN);
+ }
if (IsPostDom) UpdateRootsAfterUpdate(DT, BUI);
}
diff --git a/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/include/llvm/Transforms/Vectorize/SLPVectorizer.h
index 25f264c4722c..781a628a0974 100644
--- a/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ b/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -95,14 +95,9 @@ private:
bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
/// \brief Try to vectorize a list of operands.
- /// \@param BuildVector A list of users to ignore for the purpose of
- /// scheduling and cost estimation when NeedExtraction
- /// is false.
/// \returns true if a value was vectorized.
bool tryToVectorizeList(ArrayRef<Value *> VL, slpvectorizer::BoUpSLP &R,
- ArrayRef<Value *> BuildVector = None,
- bool AllowReorder = false,
- bool NeedExtraction = false);
+ bool AllowReorder = false);
/// \brief Try to vectorize a chain that may start at the operands of \p I.
bool tryToVectorize(Instruction *I, slpvectorizer::BoUpSLP &R);
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 9dc1ab4e6bb5..26ca8d4ee88c 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -2700,8 +2700,13 @@ public:
// we still need to collect it because the original value is different.
// And later we will need all original values as anchors while finding
// the common Phi node.
+ // We must also reject the case where the base offset differs but the
+ // scale register is not null: we cannot handle it, because the merge
+ // of different offsets would end up being used as the ScaleReg.
if (DifferentField != ExtAddrMode::MultipleFields &&
- DifferentField != ExtAddrMode::ScaleField) {
+ DifferentField != ExtAddrMode::ScaleField &&
+ (DifferentField != ExtAddrMode::BaseOffsField ||
+ !NewAddrMode.ScaledReg)) {
AddrModes.emplace_back(NewAddrMode);
return true;
}
diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp
index 8b9545da914e..3888226fa059 100644
--- a/lib/CodeGen/GlobalMerge.cpp
+++ b/lib/CodeGen/GlobalMerge.cpp
@@ -577,7 +577,8 @@ bool GlobalMerge::doInitialization(Module &M) {
for (auto &GV : M.globals()) {
// Merge is safe for "normal" internal or external globals only
if (GV.isDeclaration() || GV.isThreadLocal() ||
- GV.hasSection() || GV.hasImplicitSection())
+ GV.hasSection() || GV.hasImplicitSection() ||
+ GV.hasDLLExportStorageClass())
continue;
// It's not safe to merge globals that may be preempted
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index 45078081987a..11acbe687a31 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -719,15 +719,14 @@ bool PeepholeOptimizer::findNextSource(unsigned Reg, unsigned SubReg,
CurSrcPair = Pair;
ValueTracker ValTracker(CurSrcPair.Reg, CurSrcPair.SubReg, *MRI,
!DisableAdvCopyOpt, TII);
- ValueTrackerResult Res;
- bool ShouldRewrite = false;
- do {
- // Follow the chain of copies until we reach the top of the use-def chain
- // or find a more suitable source.
- Res = ValTracker.getNextSource();
+ // Follow the chain of copies until we find a more suitable source or a
+ // PHI, or have to abort.
+ while (true) {
+ ValueTrackerResult Res = ValTracker.getNextSource();
+ // Abort at the end of a chain (without finding a suitable source).
if (!Res.isValid())
- break;
+ return false;
// Insert the Def -> Use entry for the recently found source.
ValueTrackerResult CurSrcRes = RewriteMap.lookup(CurSrcPair);
@@ -763,24 +762,19 @@ bool PeepholeOptimizer::findNextSource(unsigned Reg, unsigned SubReg,
if (TargetRegisterInfo::isPhysicalRegister(CurSrcPair.Reg))
return false;
+ // Keep following the chain if the value isn't any better yet.
const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg);
- ShouldRewrite = TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC,
- CurSrcPair.SubReg);
- } while (!ShouldRewrite);
-
- // Continue looking for new sources...
- if (Res.isValid())
- continue;
+ if (!TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC, CurSrcPair.SubReg))
+ continue;
- // Do not continue searching for a new source if there's at least
- // one use-def which cannot be rewritten.
- if (!ShouldRewrite)
- return false;
- }
+ // We currently cannot deal with subreg operands on PHI instructions
+ // (see insertPHI()).
+ if (PHICount > 0 && CurSrcPair.SubReg != 0)
+ continue;
- if (PHICount >= RewritePHILimit) {
- DEBUG(dbgs() << "findNextSource: PHI limit reached\n");
- return false;
+ // We found a suitable source, and are done with this chain.
+ break;
+ }
}
// If we did not find a more suitable source, there is nothing to optimize.
@@ -799,6 +793,9 @@ insertPHI(MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
assert(!SrcRegs.empty() && "No sources to create a PHI instruction?");
const TargetRegisterClass *NewRC = MRI->getRegClass(SrcRegs[0].Reg);
+ // NewRC is only correct if no subregisters are involved. findNextSource()
+ // should have rejected those cases already.
+ assert(SrcRegs[0].SubReg == 0 && "should not have subreg operand");
unsigned NewVR = MRI->createVirtualRegister(NewRC);
MachineBasicBlock *MBB = OrigPHI->getParent();
MachineInstrBuilder MIB = BuildMI(*MBB, OrigPHI, OrigPHI->getDebugLoc(),
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 81bff4d7eefa..2c6b724c02df 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3842,9 +3842,16 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N,
EVT ExtVT;
if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
isLegalNarrowLoad(Load, ISD::ZEXTLOAD, ExtVT)) {
- // Only add this load if we can make it more narrow.
- if (ExtVT.bitsLT(Load->getMemoryVT()))
+
+ // ZEXTLOAD is already small enough.
+ if (Load->getExtensionType() == ISD::ZEXTLOAD &&
+ ExtVT.bitsGE(Load->getMemoryVT()))
+ continue;
+
+ // Use LE to convert equal-sized loads to zext.
+ if (ExtVT.bitsLE(Load->getMemoryVT()))
Loads.insert(Load);
+
continue;
}
return false;
@@ -3899,11 +3906,13 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
if (Loads.size() == 0)
return false;
+ DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
SDValue MaskOp = N->getOperand(1);
// If it exists, fixup the single node we allow in the tree that needs
// masking.
if (FixupNode) {
+ DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
FixupNode->getValueType(0),
SDValue(FixupNode, 0), MaskOp);
@@ -3914,14 +3923,21 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
// Narrow any constants that need it.
for (auto *LogicN : NodesWithConsts) {
- auto *C = cast<ConstantSDNode>(LogicN->getOperand(1));
- SDValue And = DAG.getNode(ISD::AND, SDLoc(C), C->getValueType(0),
- SDValue(C, 0), MaskOp);
- DAG.UpdateNodeOperands(LogicN, LogicN->getOperand(0), And);
+ SDValue Op0 = LogicN->getOperand(0);
+ SDValue Op1 = LogicN->getOperand(1);
+
+ if (isa<ConstantSDNode>(Op0))
+ std::swap(Op0, Op1);
+
+ SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(),
+ Op1, MaskOp);
+
+ DAG.UpdateNodeOperands(LogicN, Op0, And);
}
// Create narrow loads.
for (auto *Load : Loads) {
+ DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
SDValue(Load, 0), MaskOp);
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
@@ -5209,7 +5225,7 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
return SDValue();
// Loads must share the same base address
- BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG);
+ BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
int64_t ByteOffsetFromBase = 0;
if (!Base)
Base = Ptr;
@@ -12928,7 +12944,7 @@ void DAGCombiner::getStoreMergeCandidates(
StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) {
// This holds the base pointer, index, and the offset in bytes from the base
// pointer.
- BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
+ BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
EVT MemVT = St->getMemoryVT();
SDValue Val = peekThroughBitcast(St->getValue());
@@ -12949,7 +12965,7 @@ void DAGCombiner::getStoreMergeCandidates(
EVT LoadVT;
if (IsLoadSrc) {
auto *Ld = cast<LoadSDNode>(Val);
- LBasePtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
+ LBasePtr = BaseIndexOffset::match(Ld, DAG);
LoadVT = Ld->getMemoryVT();
// Load and store should be the same type.
if (MemVT != LoadVT)
@@ -12968,7 +12984,7 @@ void DAGCombiner::getStoreMergeCandidates(
return false;
// The Load's Base Ptr must also match
if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Val)) {
- auto LPtr = BaseIndexOffset::match(OtherLd->getBasePtr(), DAG);
+ auto LPtr = BaseIndexOffset::match(OtherLd, DAG);
if (LoadVT != OtherLd->getMemoryVT())
return false;
if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
@@ -12992,7 +13008,7 @@ void DAGCombiner::getStoreMergeCandidates(
Val.getOpcode() != ISD::EXTRACT_SUBVECTOR)
return false;
}
- Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
+ Ptr = BaseIndexOffset::match(Other, DAG);
return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
};
@@ -13365,7 +13381,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
if (Ld->getMemoryVT() != MemVT)
break;
- BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
+ BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
// If this is not the first ptr that we check.
int64_t LdOffset = 0;
if (LdBasePtr.getBase().getNode()) {
@@ -17432,44 +17448,46 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
unsigned NumBytes1 = Op1->getMemoryVT().getStoreSize();
// Check for BaseIndexOffset matching.
- BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0->getBasePtr(), DAG);
- BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1->getBasePtr(), DAG);
+ BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0, DAG);
+ BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1, DAG);
int64_t PtrDiff;
- if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff))
- return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0));
-
- // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
- // able to calculate their relative offset if at least one arises
- // from an alloca. However, these allocas cannot overlap and we
- // can infer there is no alias.
- if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase()))
- if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) {
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- // If the bases are the same frame index but we couldn't find a
- // constant offset (the indices are different), be conservative.
- if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) ||
- !MFI.isFixedObjectIndex(B->getIndex())))
- return false;
- }
-
- bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase());
- bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase());
- bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase());
- bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase());
- bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase());
- bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase());
+ if (BasePtr0.getBase().getNode() && BasePtr1.getBase().getNode()) {
+ if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff))
+ return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0));
+
+ // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
+ // able to calculate their relative offset if at least one arises
+ // from an alloca. However, these allocas cannot overlap and we
+ // can infer there is no alias.
+ if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase()))
+ if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) {
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ // If the bases are the same frame index but we couldn't find a
+ // constant offset (the indices are different), be conservative.
+ if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) ||
+ !MFI.isFixedObjectIndex(B->getIndex())))
+ return false;
+ }
- // If the base kinds are mismatched, or the indices are checkable, we
- // can check that they do not alias.
- if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) ||
- (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) &&
- (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1))
- return false;
+ bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase());
+ bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase());
+ bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase());
+ bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase());
+ bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase());
+ bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase());
+
+ // If the base kinds are mismatched, or the indices are checkable, we
+ // can check that they do not alias.
+ if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) ||
+ (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) &&
+ (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1))
+ return false;
+ }
- // If we know required SrcValue1 and SrcValue2 have relatively large alignment
- // compared to the size and offset of the access, we may be able to prove they
- // do not alias. This check is conservative for now to catch cases created by
- // splitting vector types.
+ // If we know required SrcValue1 and SrcValue2 have relatively large
+ // alignment compared to the size and offset of the access, we may be able
+ // to prove they do not alias. This check is conservative for now to catch
+ // cases created by splitting vector types.
int64_t SrcValOffset0 = Op0->getSrcValueOffset();
int64_t SrcValOffset1 = Op1->getSrcValueOffset();
unsigned OrigAlignment0 = Op0->getOriginalAlignment();
@@ -17479,8 +17497,8 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0;
int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1;
- // There is no overlap between these relatively aligned accesses of similar
- // size. Return no alias.
+ // There is no overlap between these relatively aligned accesses of
+ // similar size. Return no alias.
if ((OffAlign0 + NumBytes0) <= OffAlign1 ||
(OffAlign1 + NumBytes1) <= OffAlign0)
return false;
@@ -17643,7 +17661,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
// This holds the base pointer, index, and the offset in bytes from the base
// pointer.
- BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
+ BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
// We must have a base and an offset.
if (!BasePtr.getBase().getNode())
@@ -17669,7 +17687,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
break;
// Find the base pointer and offset for this memory node.
- BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
+ BaseIndexOffset Ptr = BaseIndexOffset::match(Index, DAG);
// Check that the base pointer is the same as the original one.
if (!BasePtr.equalBaseIndex(Ptr, DAG))
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bb1dc17b7a1b..b566c232cbc3 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2965,12 +2965,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
case ISD::ZERO_EXTEND:
LHS = DAG.getNode(ISD::AssertZext, dl, OuterType, Res,
DAG.getValueType(AtomicType));
- RHS = DAG.getNode(ISD::ZERO_EXTEND, dl, OuterType, Node->getOperand(2));
+ RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType);
ExtRes = LHS;
break;
case ISD::ANY_EXTEND:
LHS = DAG.getZeroExtendInReg(Res, dl, AtomicType);
- RHS = DAG.getNode(ISD::ZERO_EXTEND, dl, OuterType, Node->getOperand(2));
+ RHS = DAG.getZeroExtendInReg(Node->getOperand(2), dl, AtomicType);
break;
default:
llvm_unreachable("Invalid atomic op extension");
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 4c8b63d2f239..3ffc6fa9a059 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7947,11 +7947,8 @@ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
if (VT.getSizeInBits() / 8 != Bytes)
return false;
- SDValue Loc = LD->getOperand(1);
- SDValue BaseLoc = Base->getOperand(1);
-
- auto BaseLocDecomp = BaseIndexOffset::match(BaseLoc, *this);
- auto LocDecomp = BaseIndexOffset::match(Loc, *this);
+ auto BaseLocDecomp = BaseIndexOffset::match(Base, *this);
+ auto LocDecomp = BaseIndexOffset::match(LD, *this);
int64_t Offset = 0;
if (BaseLocDecomp.equalBaseIndex(LocDecomp, *this, Offset))
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index d5980919d03c..da1574f60524 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -21,6 +21,9 @@ using namespace llvm;
bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
const SelectionDAG &DAG, int64_t &Off) {
+ // Conservatively fail if a match failed.
+ if (!Base.getNode() || !Other.Base.getNode())
+ return false;
// Initial Offset difference.
Off = Other.Offset - Offset;
@@ -72,13 +75,29 @@ bool BaseIndexOffset::equalBaseIndex(BaseIndexOffset &Other,
}
/// Parses tree in Ptr for base, index, offset addresses.
-BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) {
+BaseIndexOffset BaseIndexOffset::match(LSBaseSDNode *N,
+ const SelectionDAG &DAG) {
+ SDValue Ptr = N->getBasePtr();
+
// (((B + I*M) + c)) + c ...
SDValue Base = DAG.getTargetLoweringInfo().unwrapAddress(Ptr);
SDValue Index = SDValue();
int64_t Offset = 0;
bool IsIndexSignExt = false;
+ // Pre-inc/pre-dec ops are components of the effective address (EA).
+ if (N->getAddressingMode() == ISD::PRE_INC) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+ Offset += C->getSExtValue();
+ else // If unknown, give up now.
+ return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+ } else if (N->getAddressingMode() == ISD::PRE_DEC) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOffset()))
+ Offset -= C->getSExtValue();
+ else // If unknown, give up now.
+ return BaseIndexOffset(SDValue(), SDValue(), 0, false);
+ }
+
// Consume constant adds & ors with appropriate masking.
while (Base->getOpcode() == ISD::ADD || Base->getOpcode() == ISD::OR) {
if (auto *C = dyn_cast<ConstantSDNode>(Base->getOperand(1))) {
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 224ae1a3236a..b29a33ac1c14 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -132,9 +132,18 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) {
setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
- // Darwin 10 and higher has an optimized __bzero.
- if (!TT.isMacOSX() || !TT.isMacOSXVersionLT(10, 6) || TT.isArch64Bit()) {
- setLibcallName(RTLIB::BZERO, TT.isAArch64() ? "bzero" : "__bzero");
+ // Some Darwin versions have an optimized __bzero/bzero function.
+ switch (TT.getArch()) {
+ case Triple::x86:
+ case Triple::x86_64:
+ if (TT.isMacOSX() && !TT.isMacOSXVersionLT(10, 6))
+ setLibcallName(RTLIB::BZERO, "__bzero");
+ break;
+ case Triple::aarch64:
+ setLibcallName(RTLIB::BZERO, "bzero");
+ break;
+ default:
+ break;
}
if (darwinHasSinCos(TT)) {
diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp
index ee067a912e3c..f7170e714b9b 100644
--- a/lib/Linker/IRMover.cpp
+++ b/lib/Linker/IRMover.cpp
@@ -954,7 +954,12 @@ Expected<Constant *> IRLinker::linkGlobalValueProto(GlobalValue *SGV,
NewGV->setLinkage(GlobalValue::InternalLinkage);
Constant *C = NewGV;
- if (DGV)
+ // Only create a bitcast if necessary. In particular, with
+ // DebugTypeODRUniquing we may reach metadata in the destination module
+ // containing a GV from the source module, in which case SGV will be
+ // the same as DGV and NewGV, and TypeMap.get() will assert since it
+ // assumes it is being invoked on a type in the source module.
+ if (DGV && NewGV != SGV)
C = ConstantExpr::getBitCast(NewGV, TypeMap.get(SGV->getType()));
if (DGV && NewGV != DGV) {
diff --git a/lib/MC/MCCodeView.cpp b/lib/MC/MCCodeView.cpp
index 82b81ccc24da..5fd5bde9f1eb 100644
--- a/lib/MC/MCCodeView.cpp
+++ b/lib/MC/MCCodeView.cpp
@@ -76,6 +76,14 @@ bool CodeViewContext::addFile(MCStreamer &OS, unsigned FileNumber,
return true;
}
+MCCVFunctionInfo *CodeViewContext::getCVFunctionInfo(unsigned FuncId) {
+ if (FuncId >= Functions.size())
+ return nullptr;
+ if (Functions[FuncId].isUnallocatedFunctionInfo())
+ return nullptr;
+ return &Functions[FuncId];
+}
+
bool CodeViewContext::recordFunctionId(unsigned FuncId) {
if (FuncId >= Functions.size())
Functions.resize(FuncId + 1);
@@ -247,6 +255,67 @@ void CodeViewContext::emitFileChecksumOffset(MCObjectStreamer &OS,
OS.EmitValueImpl(SRE, 4);
}
+void CodeViewContext::addLineEntry(const MCCVLineEntry &LineEntry) {
+ size_t Offset = MCCVLines.size();
+ auto I = MCCVLineStartStop.insert(
+ {LineEntry.getFunctionId(), {Offset, Offset + 1}});
+ if (!I.second)
+ I.first->second.second = Offset + 1;
+ MCCVLines.push_back(LineEntry);
+}
+
+std::vector<MCCVLineEntry>
+CodeViewContext::getFunctionLineEntries(unsigned FuncId) {
+ std::vector<MCCVLineEntry> FilteredLines;
+ auto I = MCCVLineStartStop.find(FuncId);
+ if (I != MCCVLineStartStop.end()) {
+ MCCVFunctionInfo *SiteInfo = getCVFunctionInfo(FuncId);
+ for (size_t Idx = I->second.first, End = I->second.second; Idx != End;
+ ++Idx) {
+ unsigned LocationFuncId = MCCVLines[Idx].getFunctionId();
+ if (LocationFuncId == FuncId) {
+ // This was a .cv_loc directly for FuncId, so record it.
+ FilteredLines.push_back(MCCVLines[Idx]);
+ } else {
+ // Check if the current location is inlined in this function. If it is,
+ // synthesize a statement .cv_loc at the original inlined call site.
+ auto I = SiteInfo->InlinedAtMap.find(LocationFuncId);
+ if (I != SiteInfo->InlinedAtMap.end()) {
+ MCCVFunctionInfo::LineInfo &IA = I->second;
+ // Only add the location if it differs from the previous location.
+ // Large inlined calls will have many .cv_loc entries and we only need
+ // one line table entry in the parent function.
+ if (FilteredLines.empty() ||
+ FilteredLines.back().getFileNum() != IA.File ||
+ FilteredLines.back().getLine() != IA.Line ||
+ FilteredLines.back().getColumn() != IA.Col) {
+ FilteredLines.push_back(MCCVLineEntry(
+ MCCVLines[Idx].getLabel(),
+ MCCVLoc(FuncId, IA.File, IA.Line, IA.Col, false, false)));
+ }
+ }
+ }
+ }
+ }
+ return FilteredLines;
+}
+
+std::pair<size_t, size_t> CodeViewContext::getLineExtent(unsigned FuncId) {
+ auto I = MCCVLineStartStop.find(FuncId);
+ // Return an empty extent if there are no cv_locs for this function id.
+ if (I == MCCVLineStartStop.end())
+ return {~0ULL, 0};
+ return I->second;
+}
+
+ArrayRef<MCCVLineEntry> CodeViewContext::getLinesForExtent(size_t L, size_t R) {
+ if (R <= L)
+ return None;
+ if (L >= MCCVLines.size())
+ return None;
+ return makeArrayRef(&MCCVLines[L], R - L);
+}
+
void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS,
unsigned FuncId,
const MCSymbol *FuncBegin,
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp
index c2d3ae31c624..b85b4e082996 100644
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -868,6 +868,40 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
if (OpFlags & AArch64II::MO_GOT) {
I.setDesc(TII.get(AArch64::LOADgot));
I.getOperand(1).setTargetFlags(OpFlags);
+ } else if (TM.getCodeModel() == CodeModel::Large) {
+ // Materialize the global using movz/movk instructions.
+ unsigned MovZDstReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ auto InsertPt = std::next(I.getIterator());
+ auto MovZ =
+ BuildMI(MBB, InsertPt, I.getDebugLoc(), TII.get(AArch64::MOVZXi))
+ .addDef(MovZDstReg);
+ MovZ->addOperand(MF, I.getOperand(1));
+ MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
+ AArch64II::MO_NC);
+ MovZ->addOperand(MF, MachineOperand::CreateImm(0));
+ constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
+
+ auto BuildMovK = [&](unsigned SrcReg, unsigned char Flags,
+ unsigned Offset, unsigned ForceDstReg) {
+ unsigned DstReg =
+ ForceDstReg ? ForceDstReg
+ : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ auto MovI = BuildMI(MBB, InsertPt, MovZ->getDebugLoc(),
+ TII.get(AArch64::MOVKXi))
+ .addDef(DstReg)
+ .addReg(SrcReg);
+ MovI->addOperand(MF, MachineOperand::CreateGA(
+ GV, MovZ->getOperand(1).getOffset(), Flags));
+ MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
+ constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
+ return DstReg;
+ };
+ unsigned DstReg = BuildMovK(MovZ->getOperand(0).getReg(),
+ AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
+ DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
+ BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
+ I.eraseFromParent();
+ return true;
} else {
I.setDesc(TII.get(AArch64::MOVaddr));
I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
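The MOVZ/MOVK sequence built above materializes a 64-bit global address 16 bits at a time under the large code model. A plain-C++ arithmetic sketch of what the four instructions compute (illustration only, not SelectionDAG code):

    #include <cassert>
    #include <cstdint>

    uint64_t materializeLarge(uint64_t Addr) {
      uint64_t R = Addr & 0xFFFFull;                             // MOVZ: set bits [15:0], zero the rest
      R = (R & ~(0xFFFFull << 16)) | (Addr & (0xFFFFull << 16)); // MOVK, lsl #16
      R = (R & ~(0xFFFFull << 32)) | (Addr & (0xFFFFull << 32)); // MOVK, lsl #32
      R = (R & ~(0xFFFFull << 48)) | (Addr & (0xFFFFull << 48)); // MOVK, lsl #48
      return R;
    }

    int main() {
      assert(materializeLarge(0x123456789ABCDEF0ull) == 0x123456789ABCDEF0ull);
      return 0;
    }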
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index 740861851185..f08c50540656 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -821,7 +821,6 @@ namespace llvm {
MutableArrayRef<int> NewMask, unsigned Options = None);
OpRef packp(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
MutableArrayRef<int> NewMask);
- OpRef zerous(ShuffleMask SM, OpRef Va, ResultStack &Results);
OpRef vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
ResultStack &Results);
OpRef vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
@@ -1139,25 +1138,6 @@ OpRef HvxSelector::packp(ShuffleMask SM, OpRef Va, OpRef Vb,
return concat(Out[0], Out[1], Results);
}
-OpRef HvxSelector::zerous(ShuffleMask SM, OpRef Va, ResultStack &Results) {
- DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
-
- int VecLen = SM.Mask.size();
- SmallVector<uint8_t,128> UsedBytes(VecLen);
- bool HasUnused = false;
- for (int I = 0; I != VecLen; ++I) {
- if (SM.Mask[I] != -1)
- UsedBytes[I] = 0xFF;
- else
- HasUnused = true;
- }
- if (!HasUnused)
- return Va;
- SDValue B = getVectorConstant(UsedBytes, SDLoc(Results.InpNode));
- Results.push(Hexagon::V6_vand, getSingleVT(MVT::i8), {Va, OpRef(B)});
- return OpRef::res(Results.top());
-}
-
OpRef HvxSelector::vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
ResultStack &Results) {
DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index f9de65fcb1df..f0e8b11a3d9c 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -142,6 +142,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+ // Sub-word ATOMIC_CMP_SWAP needs to ensure that the input is zero-extended.
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+
// PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
@@ -1154,6 +1157,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::Hi: return "PPCISD::Hi";
case PPCISD::Lo: return "PPCISD::Lo";
case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
+ case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8";
+ case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
@@ -8834,6 +8839,42 @@ SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
+// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
+// compared to a value that is atomically loaded (atomic loads zero-extend).
+SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
+ "Expecting an atomic compare-and-swap here.");
+ SDLoc dl(Op);
+ auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
+ EVT MemVT = AtomicNode->getMemoryVT();
+ if (MemVT.getSizeInBits() >= 32)
+ return Op;
+
+ SDValue CmpOp = Op.getOperand(2);
+ // If this is already correctly zero-extended, leave it alone.
+ auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits());
+ if (DAG.MaskedValueIsZero(CmpOp, HighBits))
+ return Op;
+
+ // Clear the high bits of the compare operand.
+ unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
+ SDValue NewCmpOp =
+ DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp,
+ DAG.getConstant(MaskVal, dl, MVT::i32));
+
+ // Replace the existing compare operand with the properly zero-extended one.
+ SmallVector<SDValue, 4> Ops;
+ for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
+ Ops.push_back(AtomicNode->getOperand(i));
+ Ops[2] = NewCmpOp;
+ MachineMemOperand *MMO = AtomicNode->getMemOperand();
+ SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other);
+ auto NodeTy =
+ (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8 : PPCISD::ATOMIC_CMP_SWAP_16;
+ return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
+}
+
SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -9325,6 +9366,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerREM(Op, DAG);
case ISD::BSWAP:
return LowerBSWAP(Op, DAG);
+ case ISD::ATOMIC_CMP_SWAP:
+ return LowerATOMIC_CMP_SWAP(Op, DAG);
}
}
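The motivation for LowerATOMIC_CMP_SWAP above is that PPC's sub-word atomic loads zero-extend, so a sign-extended compare operand never matches even when the low byte agrees. A plain-C++ illustration of the failure and of the mask the patch applies (values chosen for the example, not SelectionDAG code):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t Loaded = 0x00000080u;    // i8 0x80 after a zero-extending load
      int32_t CmpOp = int8_t(0x80);     // the same i8 sign-extended: 0xFFFFFF80
      std::printf("naive:  %s\n",       // prints "mismatch"
                  uint32_t(CmpOp) == Loaded ? "match" : "mismatch");
      uint32_t MaskVal = (1u << 8) - 1; // the (1 << MemVT bits) - 1 mask
      std::printf("masked: %s\n",       // prints "match"
                  (uint32_t(CmpOp) & MaskVal) == Loaded ? "match" : "mismatch");
      return 0;
    }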
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index b119e5b4a564..b3215a84829e 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -430,6 +430,11 @@ namespace llvm {
/// The 4xf32 load used for v4i1 constants.
QVLFSb,
/// ATOMIC_CMP_SWAP - the same as the target-independent nodes, except that
/// they ensure the compare input is zero-extended for sub-word versions,
/// because the atomic loads zero-extend.
+ ATOMIC_CMP_SWAP_8, ATOMIC_CMP_SWAP_16,
+
/// GPRC = TOC_ENTRY GA, TOC
/// Loads the entry for GA from the TOC, where the TOC base is given by
/// the last operand.
@@ -955,6 +960,7 @@ namespace llvm {
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index a932d05b24ee..43dcc4479cf0 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -257,6 +257,13 @@ def PPCvcmp_o : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>;
def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
[SDNPHasChain, SDNPOptInGlue]>;
+// PPC-specific atomic operations.
+def PPCatomicCmpSwap_8 :
+ SDNode<"PPCISD::ATOMIC_CMP_SWAP_8", SDTAtomic3,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+def PPCatomicCmpSwap_16 :
+ SDNode<"PPCISD::ATOMIC_CMP_SWAP_16", SDTAtomic3,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
@@ -1710,6 +1717,11 @@ let usesCustomInserter = 1 in {
}
}
+def : Pat<(PPCatomicCmpSwap_8 xoaddr:$ptr, i32:$old, i32:$new),
+ (ATOMIC_CMP_SWAP_I8 xoaddr:$ptr, i32:$old, i32:$new)>;
+def : Pat<(PPCatomicCmpSwap_16 xoaddr:$ptr, i32:$old, i32:$new),
+ (ATOMIC_CMP_SWAP_I16 xoaddr:$ptr, i32:$old, i32:$new)>;
+
// Instructions to support atomic operations
let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
def LBARX : XForm_1<31, 52, (outs gprc:$rD), (ins memrr:$src),
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index f1ce430f3323..f2ffba7d5418 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -2375,6 +2375,13 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
.Cases("repne", "repnz", X86::IP_HAS_REPEAT_NE)
.Default(X86::IP_NO_PREFIX); // Invalid prefix (impossible)
Flags |= Prefix;
+ if (getLexer().is(AsmToken::EndOfStatement)) {
+ // There is no real instruction following the prefix;
+ // treat the prefix itself as the instruction.
+ // TODO: there could be several prefixes one after another.
+ Flags = X86::IP_NO_PREFIX;
+ break;
+ }
Name = Parser.getTok().getString();
Parser.Lex(); // eat the prefix
// Hack: we could have something like "rep # some comment" or
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a6f56877bd64..e7d9334abe14 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7893,8 +7893,14 @@ LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
VT.getVectorNumElements());
IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
- return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,
- SDLoc(V), VT, IndicesVec, SrcVec);
+ if (SrcVec.getValueSizeInBits() < IndicesVT.getSizeInBits()) {
+ SrcVec =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(SrcVec), VT, DAG.getUNDEF(VT),
+ SrcVec, DAG.getIntPtrConstant(0, SDLoc(SrcVec)));
+ }
+ if (VT == MVT::v16i8)
+ return DAG.getNode(X86ISD::PSHUFB, SDLoc(V), VT, SrcVec, IndicesVec);
+ return DAG.getNode(X86ISD::VPERMV, SDLoc(V), VT, IndicesVec, SrcVec);
}
SDValue
@@ -18262,6 +18268,18 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
+ // For v64i1 without 64-bit support we need to split and rejoin.
+ if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+ assert(Subtarget.hasBWI() && "Expected BWI to be legal");
+ SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
+ SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
+ SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
+ SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
+ SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
+ SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+
if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
SDValue Op1Scalar;
if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
@@ -28652,13 +28670,14 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
+ SDValue NewV1 = V1; // Save operand in case early exit happens.
if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
- V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
- ShuffleVT) &&
+ NewV1, DL, DAG, Subtarget, Shuffle,
+ ShuffleSrcVT, ShuffleVT) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- Res = DAG.getBitcast(ShuffleSrcVT, V1);
+ Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
DCI.AddToWorklist(Res.getNode());
@@ -28680,33 +28699,36 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
+ SDValue NewV1 = V1; // Save operands in case early exit happens.
+ SDValue NewV2 = V2;
if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
- V1, V2, DL, DAG, Subtarget, Shuffle,
+ NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- V1 = DAG.getBitcast(ShuffleSrcVT, V1);
- DCI.AddToWorklist(V1.getNode());
- V2 = DAG.getBitcast(ShuffleSrcVT, V2);
- DCI.AddToWorklist(V2.getNode());
- Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
+ NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
+ DCI.AddToWorklist(NewV1.getNode());
+ NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
+ DCI.AddToWorklist(NewV2.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
}
- if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
- AllowIntDomain, V1, V2, DL, DAG,
- Subtarget, Shuffle, ShuffleVT,
- PermuteImm) &&
+ NewV1 = V1; // Save operands in case early exit happens.
+ NewV2 = V2;
+ if (matchBinaryPermuteVectorShuffle(
+ MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
+ NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
(!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
- V1 = DAG.getBitcast(ShuffleVT, V1);
- DCI.AddToWorklist(V1.getNode());
- V2 = DAG.getBitcast(ShuffleVT, V2);
- DCI.AddToWorklist(V2.getNode());
- Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
+ NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
+ DCI.AddToWorklist(NewV1.getNode());
+ NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
+ DCI.AddToWorklist(NewV2.getNode());
+ Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
DAG.getConstant(PermuteImm, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
return DAG.getBitcast(RootVT, Res);
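The v64i1 case added to LowerSELECT above splits a select on 32-bit targets into two v32i1 selects and concatenates the halves. The same split-and-rejoin idea on scalar 64-bit masks, as a sketch (illustration only, not SelectionDAG code):

    #include <cstdint>

    uint64_t selectMask64(bool Cond, uint64_t A, uint64_t B) {
      uint32_t Lo = Cond ? uint32_t(A) : uint32_t(B);             // select, low half
      uint32_t Hi = Cond ? uint32_t(A >> 32) : uint32_t(B >> 32); // select, high half
      return (uint64_t(Hi) << 32) | Lo;                           // rejoin the halves
    }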
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 223eed3048db..967d67a84bc0 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -754,7 +754,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
// type remains the same.
if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
MVT LegalVT = LT.second;
- if (LegalVT.getVectorElementType().getSizeInBits() ==
+ if (LegalVT.isVector() &&
+ LegalVT.getVectorElementType().getSizeInBits() ==
Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp
index c0cd1ea74a74..026fab5dbd3b 100644
--- a/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/lib/Transforms/Scalar/GVNHoist.cpp
@@ -648,7 +648,7 @@ private:
// track in a CHI. In the PDom walk, there can be values in the
// stack which are not control dependent e.g., nested loop.
if (si != RenameStack.end() && si->second.size() &&
- DT->dominates(Pred, si->second.back()->getParent())) {
+ DT->properlyDominates(Pred, si->second.back()->getParent())) {
C.Dest = BB; // Assign the edge
C.I = si->second.pop_back_val(); // Assign the argument
DEBUG(dbgs() << "\nCHI Inserted in BB: " << C.Dest->getName()
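
dominates() is reflexive in LLVM's DominatorTree (every block dominates itself), while properlyDominates() excludes the block itself, so the old check also fired when the value on the rename stack lived in Pred itself. A toy illustration of the distinction on a straight-line CFG; the two functions below are stand-ins for the DominatorTree queries, not the real API:

    #include <iostream>

    // Toy dominance on a straight-line CFG A -> B -> C, indexed 0..2.
    // dominates(X, Y) holds when X is on every path to Y, including X == Y;
    // properlyDominates additionally requires X != Y, which is exactly the
    // distinction this one-line GVNHoist fix relies on.
    bool dominates(int X, int Y) { return X <= Y; }
    bool properlyDominates(int X, int Y) { return X < Y; }

    int main() {
      int Pred = 1, ArgBlock = 1; // the argument lives in Pred itself
      std::cout << "dominates:         " << dominates(Pred, ArgBlock) << "\n";
      std::cout << "properlyDominates: " << properlyDominates(Pred, ArgBlock)
                << "\n"; // 1 then 0
    }
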
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index b8fb80b6cc26..525425bd0f0c 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -14,7 +14,6 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
#include "llvm/Analysis/RegionPass.h"
@@ -177,9 +176,8 @@ class StructurizeCFG : public RegionPass {
Region *ParentRegion;
DominatorTree *DT;
- LoopInfo *LI;
- SmallVector<RegionNode *, 8> Order;
+ std::deque<RegionNode *> Order;
BBSet Visited;
BBPhiMap DeletedPhis;
@@ -204,7 +202,7 @@ class StructurizeCFG : public RegionPass {
void gatherPredicates(RegionNode *N);
- void collectInfos();
+ void analyzeNode(RegionNode *N);
void insertConditions(bool Loops);
@@ -258,7 +256,6 @@ public:
AU.addRequired<DivergenceAnalysis>();
AU.addRequiredID(LowerSwitchID);
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
RegionPass::getAnalysisUsage(AU);
@@ -292,55 +289,17 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
/// \brief Build up the general order of nodes
void StructurizeCFG::orderNodes() {
- ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
- SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
-
- // The reverse post-order traversal of the list gives us an ordering close
- // to what we want. The only problem with it is that sometimes backedges
- // for outer loops will be visited before backedges for inner loops.
- for (RegionNode *RN : RPOT) {
- BasicBlock *BB = RN->getEntry();
- Loop *Loop = LI->getLoopFor(BB);
- ++LoopBlocks[Loop];
+ assert(Visited.empty());
+ assert(Predicates.empty());
+ assert(Loops.empty());
+ assert(LoopPreds.empty());
+
+ // This must be RPO order for the back edge detection to work
+ for (RegionNode *RN : ReversePostOrderTraversal<Region*>(ParentRegion)) {
+ // FIXME: Is there a better order to use for structurization?
+ Order.push_back(RN);
+ analyzeNode(RN);
}
-
- unsigned CurrentLoopDepth = 0;
- Loop *CurrentLoop = nullptr;
- for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
- BasicBlock *BB = (*I)->getEntry();
- unsigned LoopDepth = LI->getLoopDepth(BB);
-
- if (is_contained(Order, *I))
- continue;
-
- if (LoopDepth < CurrentLoopDepth) {
- // Make sure we have visited all blocks in this loop before moving back to
- // the outer loop.
-
- auto LoopI = I;
- while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
- LoopI++;
- BasicBlock *LoopBB = (*LoopI)->getEntry();
- if (LI->getLoopFor(LoopBB) == CurrentLoop) {
- --BlockCount;
- Order.push_back(*LoopI);
- }
- }
- }
-
- CurrentLoop = LI->getLoopFor(BB);
- if (CurrentLoop)
- LoopBlocks[CurrentLoop]--;
-
- CurrentLoopDepth = LoopDepth;
- Order.push_back(*I);
- }
-
- // This pass originally used a post-order traversal and then operated on
- // the list in reverse. Now that we are using a reverse post-order traversal
- // rather than re-working the whole pass to operate on the list in order,
- // we just reverse the list and continue to operate on it in reverse.
- std::reverse(Order.begin(), Order.end());
}
/// \brief Determine the end of the loops
@@ -466,32 +425,19 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
}
/// \brief Collect various loop and predicate infos
-void StructurizeCFG::collectInfos() {
- // Reset predicate
- Predicates.clear();
-
- // and loop infos
- Loops.clear();
- LoopPreds.clear();
+void StructurizeCFG::analyzeNode(RegionNode *RN) {
+ DEBUG(dbgs() << "Visiting: "
+ << (RN->isSubRegion() ? "SubRegion with entry: " : "")
+ << RN->getEntry()->getName() << '\n');
- // Reset the visited nodes
- Visited.clear();
-
- for (RegionNode *RN : reverse(Order)) {
- DEBUG(dbgs() << "Visiting: "
- << (RN->isSubRegion() ? "SubRegion with entry: " : "")
- << RN->getEntry()->getName() << " Loop Depth: "
- << LI->getLoopDepth(RN->getEntry()) << "\n");
-
- // Analyze all the conditions leading to a node
- gatherPredicates(RN);
+ // Analyze all the conditions leading to a node
+ gatherPredicates(RN);
- // Remember that we've seen this node
- Visited.insert(RN->getEntry());
+ // Remember that we've seen this node
+ Visited.insert(RN->getEntry());
- // Find the last back edges
- analyzeLoops(RN);
- }
+ // Find the last back edges
+ analyzeLoops(RN);
}
/// \brief Insert the missing branch conditions
@@ -664,7 +610,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
LLVMContext &Context = Func->getContext();
BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
- Order.back()->getEntry();
+ Order.front()->getEntry();
BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
Func, Insert);
DT->addNewBlock(Flow, Dominator);
@@ -744,7 +690,8 @@ bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
/// Take one node from the order vector and wire it up
void StructurizeCFG::wireFlow(bool ExitUseAllowed,
BasicBlock *LoopEnd) {
- RegionNode *Node = Order.pop_back_val();
+ RegionNode *Node = Order.front();
+ Order.pop_front();
Visited.insert(Node->getEntry());
if (isPredictableTrue(Node)) {
@@ -768,7 +715,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,
PrevNode = Node;
while (!Order.empty() && !Visited.count(LoopEnd) &&
- dominatesPredicates(Entry, Order.back())) {
+ dominatesPredicates(Entry, Order.front())) {
handleLoops(false, LoopEnd);
}
@@ -779,7 +726,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,
void StructurizeCFG::handleLoops(bool ExitUseAllowed,
BasicBlock *LoopEnd) {
- RegionNode *Node = Order.back();
+ RegionNode *Node = Order.front();
BasicBlock *LoopStart = Node->getEntry();
if (!Loops.count(LoopStart)) {
@@ -924,10 +871,9 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
ParentRegion = R;
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
orderNodes();
- collectInfos();
+
createFlow();
insertConditions(false);
insertConditions(true);
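
Net effect of the StructurizeCFG changes: the separate collectInfos() walk over a reversed SmallVector is folded into a single reverse post-order traversal that pushes each node onto a std::deque and analyzes it on the spot, and every former Order.back()/pop_back_val() consumer (getNextFlow, wireFlow, handleLoops) now reads Order.front() instead, so the work list is consumed in plain RPO with no LoopInfo-driven reshuffling. A minimal sketch of the consumption change; the block names are illustrative:

    #include <deque>
    #include <iostream>
    #include <string>

    int main() {
      // The pass used to build a SmallVector in RPO, reverse it, and pop
      // from the back; the patch keeps plain RPO order in a deque and
      // consumes it from the front, which reads in program order.
      std::deque<std::string> Order = {"entry", "loop.header", "loop.body",
                                       "exit"}; // already in RPO

      while (!Order.empty()) {
        std::string Node = Order.front(); // was Order.pop_back_val()
        Order.pop_front();
        std::cout << "wiring " << Node << "\n";
      }
    }

Dropping LoopInfoWrapperPass also shrinks the pass's analysis requirements, which is why getAnalysisUsage and runOnRegion each lose a line.
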
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6ef54385c452..64f206ea92eb 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2630,9 +2630,12 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
Instruction *LastInduction = VecInd;
for (unsigned Part = 0; Part < UF; ++Part) {
VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
- recordVectorLoopValueForInductionCast(II, LastInduction, Part);
+
if (isa<TruncInst>(EntryVal))
addMetadata(LastInduction, EntryVal);
+ else
+ recordVectorLoopValueForInductionCast(II, LastInduction, Part);
+
LastInduction = cast<Instruction>(addFastMathFlag(
Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
}
@@ -2754,15 +2757,17 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
// If we haven't yet vectorized the induction variable, splat the scalar
// induction variable, and build the necessary step vectors.
+ // TODO: Don't do it unless the vectorized IV is really required.
if (!VectorizedIV) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *EntryPart =
getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
- recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
+ else
+ recordVectorLoopValueForInductionCast(ID, EntryPart, Part);
}
}
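
In both LoopVectorize hunks the cast recording moves under an else: when EntryVal is the truncated induction (the Trunc case), the vector value produced is the truncated IV, which presumably is not the value the induction's cast chain yields, so recording it for the casts could map them to the wrong vector value; metadata attachment and cast recording are now mutually exclusive per part. A rough sketch of the shape, with illustrative names rather than the pass's real data structures:

    #include <iostream>
    #include <map>
    #include <string>

    int main() {
      std::map<std::string, std::string> VectorLoopValueMap;

      bool EntryValIsTrunc = true;
      VectorLoopValueMap["entry-val"] = "vec-iv";
      if (EntryValIsTrunc)
        std::cout << "attach metadata to the truncated IV\n";
      else
        VectorLoopValueMap["cast-key"] = "vec-iv"; // only for genuine casts

      std::cout << "cast mapping recorded: "
                << VectorLoopValueMap.count("cast-key") << "\n"; // 0 here
    }
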
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a7ccd3faec44..f301fc361abc 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1347,7 +1347,6 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
Lane << " from " << *Scalar << ".\n");
ExternalUses.emplace_back(Scalar, nullptr, Lane);
- continue;
}
for (User *U : Scalar->users()) {
DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
@@ -4417,13 +4416,11 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = { A, B };
- return tryToVectorizeList(VL, R, None, true);
+ return tryToVectorizeList(VL, R, true);
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- ArrayRef<Value *> BuildVector,
- bool AllowReorder,
- bool NeedExtraction) {
+ bool AllowReorder) {
if (VL.size() < 2)
return false;
@@ -4517,12 +4514,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
<< "\n");
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
- ArrayRef<Value *> EmptyArray;
- ArrayRef<Value *> BuildVectorSlice;
- if (!BuildVector.empty())
- BuildVectorSlice = BuildVector.slice(I, OpsWidth);
-
- R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
+ R.buildTree(Ops);
// TODO: check if we can allow reordering for more cases.
if (AllowReorder && R.shouldReorder()) {
// Conceptually, there is nothing actually preventing us from trying to
@@ -4530,7 +4522,6 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// reductions. However, at this point, we only expect to get here when
// there are exactly two operations.
assert(Ops.size() == 2);
- assert(BuildVectorSlice.empty());
Value *ReorderedOps[] = {Ops[1], Ops[0]};
R.buildTree(ReorderedOps, None);
}
@@ -4550,31 +4541,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
<< " and with tree size "
<< ore::NV("TreeSize", R.getTreeSize()));
- Value *VectorizedRoot = R.vectorizeTree();
-
- // Reconstruct the build vector by extracting the vectorized root. This
- // way we handle the case where some elements of the vector are
- // undefined.
- // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2))
- if (!BuildVectorSlice.empty()) {
- // The insert point is the last build vector instruction. The
- // vectorized root will precede it. This guarantees that we get an
- // instruction. The vectorized tree could have been constant folded.
- Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
- unsigned VecIdx = 0;
- for (auto &V : BuildVectorSlice) {
- IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
- ++BasicBlock::iterator(InsertAfter));
- Instruction *I = cast<Instruction>(V);
- assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
- Instruction *Extract =
- cast<Instruction>(Builder.CreateExtractElement(
- VectorizedRoot, Builder.getInt32(VecIdx++)));
- I->setOperand(1, Extract);
- I->moveAfter(Extract);
- InsertAfter = I;
- }
- }
+ R.vectorizeTree();
// Move to the next bundle.
I += VF - 1;
NextInst = I + 1;
@@ -5495,11 +5462,9 @@ private:
///
/// Returns true if it matches
static bool findBuildVector(InsertElementInst *LastInsertElem,
- SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
Value *V = nullptr;
do {
- BuildVector.push_back(LastInsertElem);
BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
V = LastInsertElem->getOperand(0);
if (isa<UndefValue>(V))
@@ -5508,7 +5473,6 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
if (!LastInsertElem || !LastInsertElem->hasOneUse())
return false;
} while (true);
- std::reverse(BuildVector.begin(), BuildVector.end());
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
return true;
}
@@ -5517,11 +5481,9 @@ static bool findBuildVector(InsertElementInst *LastInsertElem,
///
/// \return true if it matches.
static bool findBuildAggregate(InsertValueInst *IV,
- SmallVectorImpl<Value *> &BuildVector,
SmallVectorImpl<Value *> &BuildVectorOpds) {
Value *V;
do {
- BuildVector.push_back(IV);
BuildVectorOpds.push_back(IV->getInsertedValueOperand());
V = IV->getAggregateOperand();
if (isa<UndefValue>(V))
@@ -5530,7 +5492,6 @@ static bool findBuildAggregate(InsertValueInst *IV,
if (!IV || !IV->hasOneUse())
return false;
} while (true);
- std::reverse(BuildVector.begin(), BuildVector.end());
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
return true;
}
@@ -5706,27 +5667,25 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
if (!R.canMapToVector(IVI->getType(), DL))
return false;
- SmallVector<Value *, 16> BuildVector;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
+ if (!findBuildAggregate(IVI, BuildVectorOpds))
return false;
DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// Aggregate value is unlikely to be processed in vector register, we need to
// extract scalars into scalar registers, so NeedExtraction is set true.
- return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
+ return tryToVectorizeList(BuildVectorOpds, R);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
BasicBlock *BB, BoUpSLP &R) {
- SmallVector<Value *, 16> BuildVector;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
+ if (!findBuildVector(IEI, BuildVectorOpds))
return false;
// Vectorize starting with the build vector operands ignoring the BuildVector
// instructions for the purpose of scheduling and user extraction.
- return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
+ return tryToVectorizeList(BuildVectorOpds, R);
}
bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -5804,8 +5763,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// is done when there are exactly two elements since tryToVectorizeList
// asserts that there are only two values when AllowReorder is true.
bool AllowReorder = NumElts == 2;
- if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
- None, AllowReorder)) {
+ if (NumElts > 1 &&
+ tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
// Success start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
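
The SLPVectorizer side is a wholesale removal of the BuildVector bookkeeping: findBuildVector/findBuildAggregate now collect only the inserted scalars, tryToVectorizeList loses its BuildVector and NeedExtraction parameters, and the hand-rolled loop that rebuilt the insert chain with extractelements after vectorizeTree() is gone. Dropping the `continue` in buildTree means a scalar recorded as an extra argument still has its users scanned, so the generic ExternalUses path emits the needed extracts instead. A toy version of the slimmed-down chain walk; the types and names are illustrative:

    #include <algorithm>
    #include <iostream>
    #include <vector>

    // Toy findBuildVector after the patch: walk an insertelement chain
    // bottom-up, collect only the inserted scalars, and leave the insert
    // instructions themselves to the generic extract handling.
    struct Insert {
      int Scalar;   // operand 1 of the insertelement
      Insert *Prev; // operand 0: previous insert, or null for undef
    };

    bool findBuildVectorOpds(Insert *Last, std::vector<int> &Opds) {
      for (Insert *I = Last; I; I = I->Prev)
        Opds.push_back(I->Scalar);
      std::reverse(Opds.begin(), Opds.end()); // restore build order
      return !Opds.empty();
    }

    int main() {
      Insert I0{10, nullptr}, I1{11, &I0}, I2{12, &I1};
      std::vector<int> Opds;
      if (findBuildVectorOpds(&I2, Opds))
        for (int S : Opds)
          std::cout << S << ' '; // 10 11 12
      std::cout << '\n';
    }
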
diff --git a/test/CodeGen/AArch64/GlobalISel/select-gv-cmodel-large.mir b/test/CodeGen/AArch64/GlobalISel/select-gv-cmodel-large.mir
new file mode 100644
index 000000000000..12cd832665b3
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-gv-cmodel-large.mir
@@ -0,0 +1,61 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-linux-gnu -code-model=large -run-pass=instruction-select -verify-machineinstrs -O0 %s -o - | FileCheck %s
+--- |
+ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+
+ @foo1 = common global [1073741824 x i32] zeroinitializer, align 4
+ @foo2 = common global [1073741824 x i32] zeroinitializer, align 4
+
+ define i32 @gv_large() {
+ entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval, align 4
+ %0 = load i32, i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo1, i64 0, i64 0), align 4
+ %1 = load i32, i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo2, i64 0, i64 0), align 4
+ %add = add nsw i32 %0, %1
+ ret i32 %add
+ }
+
+...
+---
+name: gv_large
+legalized: true
+regBankSelected: true
+stack:
+ - { id: 0, name: retval, type: default, offset: 0, size: 4, alignment: 4,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+constants:
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: gv_large
+ ; CHECK: [[MOVZXi:%[0-9]+]]:gpr64 = MOVZXi target-flags(aarch64-g0, aarch64-nc) @foo1, 0
+ ; CHECK: [[MOVKXi:%[0-9]+]]:gpr64 = MOVKXi [[MOVZXi]], target-flags(aarch64-g1, aarch64-nc) @foo1, 16
+ ; CHECK: [[MOVKXi1:%[0-9]+]]:gpr64 = MOVKXi [[MOVKXi]], target-flags(aarch64-g2, aarch64-nc) @foo1, 32
+ ; CHECK: [[MOVKXi2:%[0-9]+]]:gpr64 = MOVKXi [[MOVKXi1]], target-flags(aarch64-g3) @foo1, 48
+ ; CHECK: [[COPY:%[0-9]+]]:gpr64sp = COPY [[MOVKXi2]]
+ ; CHECK: [[MOVZXi1:%[0-9]+]]:gpr64 = MOVZXi target-flags(aarch64-g0, aarch64-nc) @foo2, 0
+ ; CHECK: [[MOVKXi3:%[0-9]+]]:gpr64 = MOVKXi [[MOVZXi1]], target-flags(aarch64-g1, aarch64-nc) @foo2, 16
+ ; CHECK: [[MOVKXi4:%[0-9]+]]:gpr64 = MOVKXi [[MOVKXi3]], target-flags(aarch64-g2, aarch64-nc) @foo2, 32
+ ; CHECK: [[MOVKXi5:%[0-9]+]]:gpr64 = MOVKXi [[MOVKXi4]], target-flags(aarch64-g3) @foo2, 48
+ ; CHECK: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[MOVKXi5]]
+ ; CHECK: STRWui %wzr, %stack.0.retval, 0 :: (store 4 into %ir.retval)
+ ; CHECK: [[LDRWui:%[0-9]+]]:gpr32 = LDRWui [[COPY]], 0 :: (load 4 from `i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo1, i64 0, i64 0)`)
+ ; CHECK: [[LDRWui1:%[0-9]+]]:gpr32 = LDRWui [[COPY1]], 0 :: (load 4 from `i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo2, i64 0, i64 0)`)
+ ; CHECK: [[ADDWrr:%[0-9]+]]:gpr32 = ADDWrr [[LDRWui]], [[LDRWui1]]
+ ; CHECK: %w0 = COPY [[ADDWrr]]
+ ; CHECK: RET_ReallyLR implicit %w0
+ %1:gpr(s32) = G_CONSTANT i32 0
+ %4:gpr(p0) = G_GLOBAL_VALUE @foo1
+ %3:gpr(p0) = COPY %4(p0)
+ %7:gpr(p0) = G_GLOBAL_VALUE @foo2
+ %6:gpr(p0) = COPY %7(p0)
+ %0:gpr(p0) = G_FRAME_INDEX %stack.0.retval
+ G_STORE %1(s32), %0(p0) :: (store 4 into %ir.retval)
+ %2:gpr(s32) = G_LOAD %3(p0) :: (load 4 from `i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo1, i64 0, i64 0)`)
+ %5:gpr(s32) = G_LOAD %6(p0) :: (load 4 from `i32* getelementptr inbounds ([1073741824 x i32], [1073741824 x i32]* @foo2, i64 0, i64 0)`)
+ %8:gpr(s32) = G_ADD %2, %5
+ %w0 = COPY %8(s32)
+ RET_ReallyLR implicit %w0
+
+...
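
For reference, the new MIR test pins down how GlobalISel materializes a 64-bit global address under the large code model: one MOVZ for bits 0-15 (target flag g0) plus three MOVKs for the g1/g2/g3 chunks. The arithmetic being encoded, as a standalone sketch with a made-up address:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Large code model: a 64-bit address is built 16 bits at a time.
      uint64_t Addr = 0x1122334455667788ULL; // illustrative address
      uint64_t Reg = Addr & 0xFFFFULL;             // MOVZ Xd, #g0
      Reg |= Addr & 0xFFFF0000ULL;                 // MOVK Xd, #g1, lsl #16
      Reg |= Addr & 0xFFFF00000000ULL;             // MOVK Xd, #g2, lsl #32
      Reg |= Addr & 0xFFFF000000000000ULL;         // MOVK Xd, #g3, lsl #48
      std::printf("%d\n", Reg == Addr);            // prints 1
    }
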
diff --git a/test/CodeGen/AArch64/atomic-ops-lse.ll b/test/CodeGen/AArch64/atomic-ops-lse.ll
index 49f716547b12..1a5cd2dc4233 100644
--- a/test/CodeGen/AArch64/atomic-ops-lse.ll
+++ b/test/CodeGen/AArch64/atomic-ops-lse.ll
@@ -629,12 +629,27 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
+; CHECK-NEXT: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
+; CHECK-NEXT: casab w0, w1, [x[[ADDR]]]
+; CHECK-NEXT: ret
+
+ ret i8 %old
+}
+
+define i1 @test_atomic_cmpxchg_i8_1(i8 %wanted, i8 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i8_1:
+ %pair = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire acquire
+ %success = extractvalue { i8, i1 } %pair, 1
-; CHECK: casab w[[NEW:[0-9]+]], w[[OLD:[0-9]+]], [x[[ADDR]]]
; CHECK-NOT: dmb
+; CHECK: adrp [[TMPADDR:x[0-9]+]], var8
+; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var8
- ret i8 %old
+; CHECK: casab w[[NEW:[0-9]+]], w1, [x[[ADDR]]]
+; CHECK-NEXT: cmp w[[NEW]], w0, uxtb
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ ret i1 %success
}
define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
@@ -644,12 +659,28 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
; CHECK-NOT: dmb
; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
-; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
+; CHECK-NEXT: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
+; CHECK-NEXT: casah w0, w1, [x[[ADDR]]]
+; CHECK-NEXT: ret
+
+ ret i16 %old
+}
+
+define i1 @test_atomic_cmpxchg_i16_1(i16 %wanted, i16 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i16_1:
+ %pair = cmpxchg i16* @var16, i16 %wanted, i16 %new acquire acquire
+ %success = extractvalue { i16, i1 } %pair, 1
-; CHECK: casah w0, w1, [x[[ADDR]]]
; CHECK-NOT: dmb
+; CHECK: adrp [[TMPADDR:x[0-9]+]], var16
+; CHECK-NEXT: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var16
- ret i16 %old
+; CHECK: casah w[[NEW:[0-9]+]], w1, [x[[ADDR]]]
+; CHECK-NEXT: cmp w[[NEW]], w0, uxth
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+
+ ret i1 %success
}
define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
diff --git a/test/CodeGen/AMDGPU/multilevel-break.ll b/test/CodeGen/AMDGPU/multilevel-break.ll
index 8cc02d497098..5b556f12f0d6 100644
--- a/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -66,9 +66,10 @@ ENDIF: ; preds = %LOOP
; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
; OPT: llvm.amdgcn.break
-; OPT: llvm.amdgcn.loop
+; OPT: llvm.amdgcn.break
; OPT: llvm.amdgcn.if.break
; OPT: llvm.amdgcn.if.break
+; OPT: llvm.amdgcn.loop
; OPT: llvm.amdgcn.end.cf
; GCN-LABEL: {{^}}multi_if_break_loop:
diff --git a/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index 672549c8ea63..96d2841e685f 100644
--- a/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -124,55 +124,100 @@ bb23: ; preds = %bb10
; Earlier version of above, before a run of the structurizer.
; IR-LABEL: @nested_loop_conditions(
-; IR: Flow7:
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %17)
-; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %15)
-; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
-; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
-; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow8
+; IR: %tmp1235 = icmp slt i32 %tmp1134, 9
+; IR: br i1 %tmp1235, label %bb14.lr.ph, label %Flow
+
+; IR: bb14.lr.ph:
+; IR: br label %bb14
+
+; IR: Flow3:
+; IR: call void @llvm.amdgcn.end.cf(i64 %18)
+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %17)
+; IR: %1 = extractvalue { i1, i64 } %0, 0
+; IR: %2 = extractvalue { i1, i64 } %0, 1
+; IR: br i1 %1, label %bb4.bb13_crit_edge, label %Flow4
+
+; IR: bb4.bb13_crit_edge:
+; IR: br label %Flow4
+
+; IR: Flow4:
+; IR: %3 = phi i1 [ true, %bb4.bb13_crit_edge ], [ false, %Flow3 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %2)
+; IR: br label %Flow
+
+; IR: bb13:
+; IR: br label %bb31
+
+; IR: Flow:
+; IR: %4 = phi i1 [ %3, %Flow4 ], [ true, %bb ]
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.if(i1 %4)
+; IR: %6 = extractvalue { i1, i64 } %5, 0
+; IR: %7 = extractvalue { i1, i64 } %5, 1
+; IR: br i1 %6, label %bb13, label %bb31
+
+; IR: bb14:
+; IR: %phi.broken = phi i64 [ %18, %Flow2 ], [ 0, %bb14.lr.ph ]
+; IR: %tmp1037 = phi i32 [ %tmp1033, %bb14.lr.ph ], [ %16, %Flow2 ]
+; IR: %tmp936 = phi <4 x i32> [ %tmp932, %bb14.lr.ph ], [ %15, %Flow2 ]
+; IR: %tmp15 = icmp eq i32 %tmp1037, 1
+; IR: %8 = xor i1 %tmp15, true
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
+; IR: %10 = extractvalue { i1, i64 } %9, 0
+; IR: %11 = extractvalue { i1, i64 } %9, 1
+; IR: br i1 %10, label %bb31.loopexit, label %Flow1
; IR: Flow1:
-; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ]
-; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ]
-; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ]
-; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ]
-; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ]
-; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi)
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
-; IR-NEXT: %18 = call i1 @llvm.amdgcn.loop(i64 %17)
-; IR-NEXT: br i1 %18, label %Flow7, label %bb14
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.else(i64 %11)
+; IR: %13 = extractvalue { i1, i64 } %12, 0
+; IR: %14 = extractvalue { i1, i64 } %12, 1
+; IR: br i1 %13, label %bb16, label %Flow2
+
+; IR: bb16:
+; IR: %tmp17 = bitcast i64 %tmp3 to <2 x i32>
+; IR: br label %bb18
; IR: Flow2:
-; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ]
-; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ]
-; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ]
-; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ]
-; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ]
-; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ]
-; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23)
-; IR-NEXT: %25 = extractvalue { i1, i64 } %24, 0
-; IR-NEXT: %26 = extractvalue { i1, i64 } %24, 1
-; IR-NEXT: br i1 %25, label %bb21, label %Flow3
+; IR: %loop.phi = phi i64 [ %21, %bb21 ], [ %phi.broken, %Flow1 ]
+; IR: %15 = phi <4 x i32> [ %tmp9, %bb21 ], [ undef, %Flow1 ]
+; IR: %16 = phi i32 [ %tmp10, %bb21 ], [ undef, %Flow1 ]
+; IR: %17 = phi i1 [ %20, %bb21 ], [ false, %Flow1 ]
+; IR: %18 = call i64 @llvm.amdgcn.else.break(i64 %14, i64 %loop.phi)
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR: %19 = call i1 @llvm.amdgcn.loop(i64 %18)
+; IR: br i1 %19, label %Flow3, label %bb14
+
+; IR: bb18:
+; IR: %tmp19 = load volatile i32, i32 addrspace(1)* undef
+; IR: %tmp20 = icmp slt i32 %tmp19, 9
+; IR: br i1 %tmp20, label %bb21, label %bb18
; IR: bb21:
-; IR: %tmp12 = icmp slt i32 %tmp11, 9
-; IR-NEXT: %27 = xor i1 %tmp12, true
-; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken)
-; IR-NEXT: br label %Flow3
-
-; IR: Flow3:
-; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ]
-; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ]
-; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
-; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
-; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ]
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26)
-; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4
+; IR: %tmp22 = extractelement <2 x i32> %tmp17, i64 1
+; IR: %tmp23 = lshr i32 %tmp22, 16
+; IR: %tmp24 = select i1 undef, i32 undef, i32 %tmp23
+; IR: %tmp25 = uitofp i32 %tmp24 to float
+; IR: %tmp26 = fmul float %tmp25, 0x3EF0001000000000
+; IR: %tmp27 = fsub float %tmp26, undef
+; IR: %tmp28 = fcmp olt float %tmp27, 5.000000e-01
+; IR: %tmp29 = select i1 %tmp28, i64 1, i64 2
+; IR: %tmp30 = extractelement <4 x i32> %tmp936, i64 %tmp29
+; IR: %tmp7 = zext i32 %tmp30 to i64
+; IR: %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %tmp7
+; IR: %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
+; IR: %tmp10 = extractelement <4 x i32> %tmp9, i64 0
+; IR: %tmp11 = load volatile i32, i32 addrspace(1)* undef
+; IR: %tmp12 = icmp slt i32 %tmp11, 9
+; IR: %20 = xor i1 %tmp12, true
+; IR: %21 = call i64 @llvm.amdgcn.if.break(i1 %20, i64 %phi.broken)
+; IR: br label %Flow2
+
+; IR: bb31.loopexit:
+; IR: br label %Flow1
; IR: bb31:
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %7)
-; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
-; IR-NEXT: ret void
+; IR: call void @llvm.amdgcn.end.cf(i64 %7)
+; IR: store volatile i32 0, i32 addrspace(1)* undef
+; IR: ret void
; GCN-LABEL: {{^}}nested_loop_conditions:
diff --git a/test/CodeGen/ARM/and-load-combine.ll b/test/CodeGen/ARM/and-load-combine.ll
index 2b92778f469d..69b00ed4853a 100644
--- a/test/CodeGen/ARM/and-load-combine.ll
+++ b/test/CodeGen/ARM/and-load-combine.ll
@@ -852,8 +852,7 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
; ARM: @ %bb.0: @ %entry
; ARM-NEXT: ldrb r0, [r0]
; ARM-NEXT: uxtb r2, r2
-; ARM-NEXT: and r0, r0, r1
-; ARM-NEXT: uxtb r1, r0
+; ARM-NEXT: and r1, r0, r1
; ARM-NEXT: mov r0, #0
; ARM-NEXT: cmp r1, r2
; ARM-NEXT: movweq r0, #1
@@ -863,8 +862,7 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
; ARMEB: @ %bb.0: @ %entry
; ARMEB-NEXT: ldrb r0, [r0]
; ARMEB-NEXT: uxtb r2, r2
-; ARMEB-NEXT: and r0, r0, r1
-; ARMEB-NEXT: uxtb r1, r0
+; ARMEB-NEXT: and r1, r0, r1
; ARMEB-NEXT: mov r0, #0
; ARMEB-NEXT: cmp r1, r2
; ARMEB-NEXT: movweq r0, #1
@@ -872,9 +870,8 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
;
; THUMB1-LABEL: test6:
; THUMB1: @ %bb.0: @ %entry
-; THUMB1-NEXT: ldrb r0, [r0]
-; THUMB1-NEXT: ands r0, r1
-; THUMB1-NEXT: uxtb r3, r0
+; THUMB1-NEXT: ldrb r3, [r0]
+; THUMB1-NEXT: ands r3, r1
; THUMB1-NEXT: uxtb r2, r2
; THUMB1-NEXT: movs r0, #1
; THUMB1-NEXT: movs r1, #0
@@ -889,8 +886,7 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
; THUMB2: @ %bb.0: @ %entry
; THUMB2-NEXT: ldrb r0, [r0]
; THUMB2-NEXT: uxtb r2, r2
-; THUMB2-NEXT: ands r0, r1
-; THUMB2-NEXT: uxtb r1, r0
+; THUMB2-NEXT: ands r1, r0
; THUMB2-NEXT: movs r0, #0
; THUMB2-NEXT: cmp r1, r2
; THUMB2-NEXT: it eq
diff --git a/test/CodeGen/ARM/atomic-cmpxchg.ll b/test/CodeGen/ARM/atomic-cmpxchg.ll
index a136e44fc196..fec116677085 100644
--- a/test/CodeGen/ARM/atomic-cmpxchg.ll
+++ b/test/CodeGen/ARM/atomic-cmpxchg.ll
@@ -49,9 +49,10 @@ entry:
; CHECK-THUMBV6: mov [[EXPECTED:r[0-9]+]], r1
; CHECK-THUMBV6-NEXT: bl __sync_val_compare_and_swap_1
; CHECK-THUMBV6-NEXT: mov [[RES:r[0-9]+]], r0
+; CHECK-THUMBV6-NEXT: uxtb [[EXPECTED_ZEXT:r[0-9]+]], [[EXPECTED]]
; CHECK-THUMBV6-NEXT: movs r0, #1
; CHECK-THUMBV6-NEXT: movs [[ZERO:r[0-9]+]], #0
-; CHECK-THUMBV6-NEXT: cmp [[RES]], [[EXPECTED]]
+; CHECK-THUMBV6-NEXT: cmp [[RES]], [[EXPECTED_ZEXT]]
; CHECK-THUMBV6-NEXT: beq [[END:.LBB[0-9_]+]]
; CHECK-THUMBV6-NEXT: mov r0, [[ZERO]]
; CHECK-THUMBV6-NEXT: [[END]]:
diff --git a/test/CodeGen/ARM/cmpxchg-O0.ll b/test/CodeGen/ARM/cmpxchg-O0.ll
index f8ad2bbbbe0e..b49378d6702e 100644
--- a/test/CodeGen/ARM/cmpxchg-O0.ll
+++ b/test/CodeGen/ARM/cmpxchg-O0.ll
@@ -17,7 +17,8 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind {
; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
; CHECK: bne [[RETRY]]
; CHECK: [[DONE]]:
-; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED]]
+; CHECK: uxtb [[DESIRED_ZEXT:r[0-9]+]], [[DESIRED]]
+; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED_ZEXT]]
; CHECK: {{moveq|movweq}} {{r[0-9]+}}, #1
; CHECK: dmb ish
%res = cmpxchg i8* %addr, i8 %desired, i8 %new seq_cst monotonic
@@ -36,7 +37,8 @@ define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind
; CHECK: cmp{{(\.w)?}} [[STATUS]], #0
; CHECK: bne [[RETRY]]
; CHECK: [[DONE]]:
-; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED]]
+; CHECK: uxth [[DESIRED_ZEXT:r[0-9]+]], [[DESIRED]]
+; CHECK: cmp{{(\.w)?}} [[OLD]], [[DESIRED_ZEXT]]
; CHECK: {{moveq|movweq}} {{r[0-9]+}}, #1
; CHECK: dmb ish
%res = cmpxchg i16* %addr, i16 %desired, i16 %new seq_cst monotonic
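
The uxtb/uxth additions in these ARM tests (and the rlwinm masking in the PowerPC tests further down) all come from the same requirement: cmpxchg on i8/i16 performs its equality compare in a 32-bit register, the loaded old value is zero-extended by the narrow load, so the expected value has to be zero-extended too or a negative expected byte would never match. A standalone demonstration of the failure mode:

    #include <cstdint>
    #include <cstdio>

    int main() {
      int8_t Expected = -100; // bit pattern 0x9C
      uint32_t Old = 0x9C;    // zero-extended result of the narrow load

      bool SignExtended = (uint32_t)(int32_t)Expected == Old; // 0xFFFFFF9C
      bool ZeroExtended = (uint32_t)(uint8_t)Expected == Old; // 0x0000009C

      std::printf("sign-extended compare: %d, zero-extended compare: %d\n",
                  SignExtended, ZeroExtended); // 0 then 1
    }
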
diff --git a/test/CodeGen/ARM/global-merge-dllexport.ll b/test/CodeGen/ARM/global-merge-dllexport.ll
new file mode 100644
index 000000000000..680f57d0a17b
--- /dev/null
+++ b/test/CodeGen/ARM/global-merge-dllexport.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=thumbv7-win32 -arm-global-merge | FileCheck %s
+
+@x = global i32 0, align 4
+@y = dllexport global i32 0, align 4
+
+define void @f1(i32 %a1, i32 %a2) {
+; CHECK: f1:
+; CHECK: movw [[REG1:r[0-9]+]], :lower16:x
+; CHECK: movt [[REG1]], :upper16:x
+ store i32 %a1, i32* @x, align 4
+ store i32 %a2, i32* @y, align 4
+ ret void
+}
+
+; CHECK-NOT: .L_MergedGlobals
diff --git a/test/CodeGen/ARM/global-merge-external.ll b/test/CodeGen/ARM/global-merge-external.ll
index 03c977614320..25bbd0869581 100644
--- a/test/CodeGen/ARM/global-merge-external.ll
+++ b/test/CodeGen/ARM/global-merge-external.ll
@@ -1,8 +1,9 @@
-; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge | FileCheck %s --check-prefix=CHECK-MERGE
-; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -global-merge-on-external=true | FileCheck %s --check-prefix=CHECK-MERGE
-; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -global-merge-on-external=false | FileCheck %s --check-prefix=CHECK-NO-MERGE
-; RUN: llc < %s -mtriple=arm-macho -arm-global-merge | FileCheck %s --check-prefix=CHECK-NO-MERGE
-; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -relocation-model=pic | FileCheck %s --check-prefix=CHECK-NO-MERGE
+; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge | FileCheck %s --check-prefixes=CHECK,CHECK-MERGE
+; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -global-merge-on-external=true | FileCheck %s --check-prefixes=CHECK,CHECK-MERGE
+; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -global-merge-on-external=false | FileCheck %s --check-prefixes=CHECK,CHECK-NO-MERGE
+; RUN: llc < %s -mtriple=arm-macho -arm-global-merge | FileCheck %s --check-prefixes=CHECK,CHECK-NO-MERGE
+; RUN: llc < %s -mtriple=arm-eabi -arm-global-merge -relocation-model=pic | FileCheck %s --check-prefixes=CHECK,CHECK-NO-MERGE
+; RUN: llc < %s -mtriple=thumbv7-win32 -arm-global-merge | FileCheck %s --check-prefixes=CHECK-WIN32
@x = global i32 0, align 4
@y = global i32 0, align 4
@@ -10,10 +11,13 @@
define void @f1(i32 %a1, i32 %a2) {
;CHECK: f1:
-;CHECK: ldr {{r[0-9]+}}, [[LABEL1:\.LCPI[0-9]+_[0-9]]]
+;CHECK: ldr {{r[0-9]+}}, [[LABEL1:\.?LCPI[0-9]+_[0-9]]]
;CHECK: [[LABEL1]]:
;CHECK-MERGE: .long .L_MergedGlobals
;CHECK-NO-MERGE: .long {{_?x}}
+;CHECK-WIN32: f1:
+;CHECK-WIN32: movw [[REG1:r[0-9]+]], :lower16:.L_MergedGlobals
+;CHECK-WIN32: movt [[REG1]], :upper16:.L_MergedGlobals
store i32 %a1, i32* @x, align 4
store i32 %a2, i32* @y, align 4
ret void
@@ -21,10 +25,13 @@ define void @f1(i32 %a1, i32 %a2) {
define void @g1(i32 %a1, i32 %a2) {
;CHECK: g1:
-;CHECK: ldr {{r[0-9]+}}, [[LABEL2:\.LCPI[0-9]+_[0-9]]]
+;CHECK: ldr {{r[0-9]+}}, [[LABEL2:\.?LCPI[0-9]+_[0-9]]]
;CHECK: [[LABEL2]]:
;CHECK-MERGE: .long .L_MergedGlobals
;CHECK-NO-MERGE: .long {{_?y}}
+;CHECK-WIN32: g1:
+;CHECK-WIN32: movw [[REG2:r[0-9]+]], :lower16:.L_MergedGlobals
+;CHECK-WIN32: movt [[REG2]], :upper16:.L_MergedGlobals
store i32 %a1, i32* @y, align 4
store i32 %a2, i32* @z, align 4
ret void
@@ -35,6 +42,7 @@ define void @g1(i32 %a1, i32 %a2) {
;CHECK-MERGE: .type .L_MergedGlobals,%object
;CHECK-MERGE: .local .L_MergedGlobals
;CHECK-MERGE: .comm .L_MergedGlobals,12,4
+;CHECK-WIN32: .lcomm .L_MergedGlobals,12,4
;CHECK-MERGE: .globl x
;CHECK-MERGE: x = .L_MergedGlobals
@@ -45,3 +53,10 @@ define void @g1(i32 %a1, i32 %a2) {
;CHECK-MERGE: .globl z
;CHECK-MERGE: z = .L_MergedGlobals+8
;CHECK-MERGE: .size z, 4
+
+;CHECK-WIN32: .globl x
+;CHECK-WIN32: x = .L_MergedGlobals
+;CHECK-WIN32: .globl y
+;CHECK-WIN32: y = .L_MergedGlobals+4
+;CHECK-WIN32: .globl z
+;CHECK-WIN32: z = .L_MergedGlobals+8
diff --git a/test/CodeGen/ARM/peephole-phi.mir b/test/CodeGen/ARM/peephole-phi.mir
new file mode 100644
index 000000000000..30343654dea1
--- /dev/null
+++ b/test/CodeGen/ARM/peephole-phi.mir
@@ -0,0 +1,67 @@
+# RUN: llc -o - %s -mtriple=armv7-- -verify-machineinstrs -run-pass=peephole-opt | FileCheck %s
+#
+# Make sure we do not crash on this input.
+# Note that this input could in principle be optimized, but right now we don't
+# have this case implemented so the output should simply be unchanged.
+#
+# CHECK-LABEL: name: func
+# CHECK: body: |
+# CHECK: bb.0:
+# CHECK: Bcc %bb.2, 1, undef %cpsr
+#
+# CHECK: bb.1:
+# CHECK: %0:dpr = IMPLICIT_DEF
+# CHECK: %1:gpr, %2:gpr = VMOVRRD %0, 14, %noreg
+# CHECK: B %bb.3
+#
+# CHECK: bb.2:
+# CHECK: %3:spr = IMPLICIT_DEF
+# CHECK: %4:gpr = VMOVRS %3, 14, %noreg
+#
+# CHECK: bb.3:
+# CHECK: %5:gpr = PHI %1, %bb.1, %4, %bb.2
+# CHECK: %6:spr = VMOVSR %5, 14, %noreg
+---
+name: func0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ Bcc %bb.2, 1, undef %cpsr
+
+ bb.1:
+ %0:dpr = IMPLICIT_DEF
+ %1:gpr, %2:gpr = VMOVRRD %0:dpr, 14, %noreg
+ B %bb.3
+
+ bb.2:
+ %3:spr = IMPLICIT_DEF
+ %4:gpr = VMOVRS %3:spr, 14, %noreg
+
+ bb.3:
+ %5:gpr = PHI %1, %bb.1, %4, %bb.2
+ %6:spr = VMOVSR %5, 14, %noreg
+...
+
+# CHECK-LABEL: name: func1
+# CHECK: %6:spr = PHI %0, %bb.1, %2, %bb.2
+# CHECK: %7:spr = COPY %6
+---
+name: func1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ Bcc %bb.2, 1, undef %cpsr
+
+ bb.1:
+ %1:spr = IMPLICIT_DEF
+ %0:gpr = VMOVRS %1, 14, %noreg
+ B %bb.3
+
+ bb.2:
+ %3:spr = IMPLICIT_DEF
+ %2:gpr = VMOVRS %3:spr, 14, %noreg
+
+ bb.3:
+ %4:gpr = PHI %0, %bb.1, %2, %bb.2
+ %5:spr = VMOVSR %4, 14, %noreg
+...
diff --git a/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll b/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
new file mode 100644
index 000000000000..093899690d07
--- /dev/null
+++ b/test/CodeGen/PowerPC/PR35812-neg-cmpxchg.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Make sure that a negative value for the compare-and-swap is zero extended
+; from i8/i16 to i32 since it will be compared for equality.
+; RUN: llc -mtriple=powerpc64le-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64le-linux-gnu -mcpu=pwr7 < %s | FileCheck %s --check-prefix=CHECK-P7
+
+@str = private unnamed_addr constant [46 x i8] c"FAILED: __atomic_compare_exchange_n() failed.\00"
+@str.1 = private unnamed_addr constant [59 x i8] c"FAILED: __atomic_compare_exchange_n() set the wrong value.\00"
+@str.2 = private unnamed_addr constant [7 x i8] c"PASSED\00"
+
+define signext i32 @main() {
+; CHECK-LABEL: main:
+; CHECK: li 3, -32477
+; CHECK: lis 12, 0
+; CHECK: li 6, 234
+; CHECK: sth 3, 46(1)
+; CHECK: ori 4, 12, 33059
+; CHECK: sync
+; CHECK: .LBB0_1: # %L.entry
+; CHECK: lharx 3, 0, 5
+; CHECK: cmpw 4, 3
+; CHECK: bne 0, .LBB0_3
+; CHECK: sthcx. 6, 0, 5
+; CHECK: bne 0, .LBB0_1
+; CHECK: b .LBB0_4
+; CHECK: .LBB0_3: # %L.entry
+; CHECK: sthcx. 3, 0, 5
+; CHECK: .LBB0_4: # %L.entry
+; CHECK: cmplwi 3, 33059
+; CHECK: lwsync
+; CHECK: lhz 3, 46(1)
+; CHECK: cmplwi 3, 234
+;
+; CHECK-P7-LABEL: main:
+; CHECK-P7: lis 4, 0
+; CHECK-P7: li 7, 0
+; CHECK-P7: li 3, -32477
+; CHECK-P7: sth 3, 46(1)
+; CHECK-P7: li 5, 234
+; CHECK-P7: ori 4, 4, 33059
+; CHECK-P7: rlwinm 3, 6, 3, 27, 27
+; CHECK-P7: ori 7, 7, 65535
+; CHECK-P7: sync
+; CHECK-P7: slw 8, 5, 3
+; CHECK-P7: slw 5, 7, 3
+; CHECK-P7: slw 9, 4, 3
+; CHECK-P7: and 7, 8, 5
+; CHECK-P7: rldicr 4, 6, 0, 61
+; CHECK-P7: and 8, 9, 5
+; CHECK-P7: .LBB0_1: # %L.entry
+; CHECK-P7: lwarx 9, 0, 4
+; CHECK-P7: and 6, 9, 5
+; CHECK-P7: cmpw 0, 6, 8
+; CHECK-P7: bne 0, .LBB0_3
+; CHECK-P7: andc 9, 9, 5
+; CHECK-P7: or 9, 9, 7
+; CHECK-P7: stwcx. 9, 0, 4
+; CHECK-P7: bne 0, .LBB0_1
+; CHECK-P7: b .LBB0_4
+; CHECK-P7: .LBB0_3: # %L.entry
+; CHECK-P7: stwcx. 9, 0, 4
+; CHECK-P7: .LBB0_4: # %L.entry
+; CHECK-P7: srw 3, 6, 3
+; CHECK-P7: lwsync
+; CHECK-P7: cmplwi 3, 33059
+; CHECK-P7: lhz 3, 46(1)
+; CHECK-P7: cmplwi 3, 234
+L.entry:
+ %value.addr = alloca i16, align 2
+ store i16 -32477, i16* %value.addr, align 2
+ %0 = cmpxchg i16* %value.addr, i16 -32477, i16 234 seq_cst seq_cst
+ %1 = extractvalue { i16, i1 } %0, 1
+ br i1 %1, label %L.B0000, label %L.B0003
+
+L.B0003: ; preds = %L.entry
+ %puts = call i32 @puts(i8* getelementptr inbounds ([46 x i8], [46 x i8]* @str, i64 0, i64 0))
+ ret i32 1
+
+L.B0000: ; preds = %L.entry
+ %2 = load i16, i16* %value.addr, align 2
+ %3 = icmp eq i16 %2, 234
+ br i1 %3, label %L.B0001, label %L.B0005
+
+L.B0005: ; preds = %L.B0000
+ %puts1 = call i32 @puts(i8* getelementptr inbounds ([59 x i8], [59 x i8]* @str.1, i64 0, i64 0))
+ ret i32 1
+
+L.B0001: ; preds = %L.B0000
+ %puts2 = call i32 @puts(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @str.2, i64 0, i64 0))
+ ret i32 0
+}
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly) #0
diff --git a/test/CodeGen/PowerPC/atomics-regression.ll b/test/CodeGen/PowerPC/atomics-regression.ll
index 7079f6dd52e9..daf55fc426d0 100644
--- a/test/CodeGen/PowerPC/atomics-regression.ll
+++ b/test/CodeGen/PowerPC/atomics-regression.ll
@@ -404,6 +404,7 @@ define void @test39() {
define void @test40(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test40:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: b .LBB40_2
; PPC64LE-NEXT: .p2align 5
; PPC64LE-NEXT: .LBB40_1:
@@ -423,6 +424,7 @@ define void @test40(i8* %ptr, i8 %cmp, i8 %val) {
define void @test41(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test41:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: .LBB41_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
; PPC64LE-NEXT: cmpw 4, 6
@@ -444,6 +446,7 @@ define void @test41(i8* %ptr, i8 %cmp, i8 %val) {
define void @test42(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test42:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: .LBB42_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
; PPC64LE-NEXT: cmpw 4, 6
@@ -465,6 +468,7 @@ define void @test42(i8* %ptr, i8 %cmp, i8 %val) {
define void @test43(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test43:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: b .LBB43_2
; PPC64LE-NEXT: .p2align 5
@@ -485,6 +489,7 @@ define void @test43(i8* %ptr, i8 %cmp, i8 %val) {
define void @test44(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test44:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: b .LBB44_2
; PPC64LE-NEXT: .p2align 5
@@ -505,6 +510,7 @@ define void @test44(i8* %ptr, i8 %cmp, i8 %val) {
define void @test45(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test45:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: .LBB45_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
@@ -527,6 +533,7 @@ define void @test45(i8* %ptr, i8 %cmp, i8 %val) {
define void @test46(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test46:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: .LBB46_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
@@ -549,6 +556,7 @@ define void @test46(i8* %ptr, i8 %cmp, i8 %val) {
define void @test47(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test47:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: .LBB47_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
@@ -571,6 +579,7 @@ define void @test47(i8* %ptr, i8 %cmp, i8 %val) {
define void @test48(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test48:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: .LBB48_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
@@ -593,6 +602,7 @@ define void @test48(i8* %ptr, i8 %cmp, i8 %val) {
define void @test49(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test49:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: .LBB49_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
@@ -615,6 +625,7 @@ define void @test49(i8* %ptr, i8 %cmp, i8 %val) {
define void @test50(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test50:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: b .LBB50_2
; PPC64LE-NEXT: .p2align 5
; PPC64LE-NEXT: .LBB50_1:
@@ -634,6 +645,7 @@ define void @test50(i16* %ptr, i16 %cmp, i16 %val) {
define void @test51(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test51:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: .LBB51_1:
; PPC64LE-NEXT: lharx 6, 0, 3
; PPC64LE-NEXT: cmpw 4, 6
@@ -655,6 +667,7 @@ define void @test51(i16* %ptr, i16 %cmp, i16 %val) {
define void @test52(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test52:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: .LBB52_1:
; PPC64LE-NEXT: lharx 6, 0, 3
; PPC64LE-NEXT: cmpw 4, 6
@@ -676,6 +689,7 @@ define void @test52(i16* %ptr, i16 %cmp, i16 %val) {
define void @test53(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test53:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: b .LBB53_2
; PPC64LE-NEXT: .p2align 5
@@ -696,6 +710,7 @@ define void @test53(i16* %ptr, i16 %cmp, i16 %val) {
define void @test54(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test54:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: b .LBB54_2
; PPC64LE-NEXT: .p2align 5
@@ -716,6 +731,7 @@ define void @test54(i16* %ptr, i16 %cmp, i16 %val) {
define void @test55(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test55:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: .LBB55_1:
; PPC64LE-NEXT: lharx 6, 0, 3
@@ -738,6 +754,7 @@ define void @test55(i16* %ptr, i16 %cmp, i16 %val) {
define void @test56(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test56:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: .LBB56_1:
; PPC64LE-NEXT: lharx 6, 0, 3
@@ -760,6 +777,7 @@ define void @test56(i16* %ptr, i16 %cmp, i16 %val) {
define void @test57(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test57:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: .LBB57_1:
; PPC64LE-NEXT: lharx 6, 0, 3
@@ -782,6 +800,7 @@ define void @test57(i16* %ptr, i16 %cmp, i16 %val) {
define void @test58(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test58:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: .LBB58_1:
; PPC64LE-NEXT: lharx 6, 0, 3
@@ -804,6 +823,7 @@ define void @test58(i16* %ptr, i16 %cmp, i16 %val) {
define void @test59(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test59:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: .LBB59_1:
; PPC64LE-NEXT: lharx 6, 0, 3
@@ -1248,6 +1268,7 @@ define void @test79(i64* %ptr, i64 %cmp, i64 %val) {
define void @test80(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test80:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: b .LBB80_2
; PPC64LE-NEXT: .p2align 5
; PPC64LE-NEXT: .LBB80_1:
@@ -1267,6 +1288,7 @@ define void @test80(i8* %ptr, i8 %cmp, i8 %val) {
define void @test81(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test81:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: .LBB81_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
; PPC64LE-NEXT: cmpw 4, 6
@@ -1288,6 +1310,7 @@ define void @test81(i8* %ptr, i8 %cmp, i8 %val) {
define void @test82(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test82:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: .LBB82_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
; PPC64LE-NEXT: cmpw 4, 6
@@ -1309,6 +1332,7 @@ define void @test82(i8* %ptr, i8 %cmp, i8 %val) {
define void @test83(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test83:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: b .LBB83_2
; PPC64LE-NEXT: .p2align 5
@@ -1329,6 +1353,7 @@ define void @test83(i8* %ptr, i8 %cmp, i8 %val) {
define void @test84(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test84:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: b .LBB84_2
; PPC64LE-NEXT: .p2align 5
@@ -1349,6 +1374,7 @@ define void @test84(i8* %ptr, i8 %cmp, i8 %val) {
define void @test85(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test85:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: .LBB85_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
@@ -1371,6 +1397,7 @@ define void @test85(i8* %ptr, i8 %cmp, i8 %val) {
define void @test86(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test86:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: .LBB86_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
@@ -1393,6 +1420,7 @@ define void @test86(i8* %ptr, i8 %cmp, i8 %val) {
define void @test87(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test87:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: .LBB87_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
@@ -1415,6 +1443,7 @@ define void @test87(i8* %ptr, i8 %cmp, i8 %val) {
define void @test88(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test88:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: .LBB88_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
@@ -1437,6 +1466,7 @@ define void @test88(i8* %ptr, i8 %cmp, i8 %val) {
define void @test89(i8* %ptr, i8 %cmp, i8 %val) {
; PPC64LE-LABEL: test89:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 24, 31
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: .LBB89_1:
; PPC64LE-NEXT: lbarx 6, 0, 3
@@ -1459,6 +1489,7 @@ define void @test89(i8* %ptr, i8 %cmp, i8 %val) {
define void @test90(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test90:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: b .LBB90_2
; PPC64LE-NEXT: .p2align 5
; PPC64LE-NEXT: .LBB90_1:
@@ -1478,6 +1509,7 @@ define void @test90(i16* %ptr, i16 %cmp, i16 %val) {
define void @test91(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test91:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: .LBB91_1:
; PPC64LE-NEXT: lharx 6, 0, 3
; PPC64LE-NEXT: cmpw 4, 6
@@ -1499,6 +1531,7 @@ define void @test91(i16* %ptr, i16 %cmp, i16 %val) {
define void @test92(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test92:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: .LBB92_1:
; PPC64LE-NEXT: lharx 6, 0, 3
; PPC64LE-NEXT: cmpw 4, 6
@@ -1520,6 +1553,7 @@ define void @test92(i16* %ptr, i16 %cmp, i16 %val) {
define void @test93(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test93:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: b .LBB93_2
; PPC64LE-NEXT: .p2align 5
@@ -1540,6 +1574,7 @@ define void @test93(i16* %ptr, i16 %cmp, i16 %val) {
define void @test94(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test94:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: b .LBB94_2
; PPC64LE-NEXT: .p2align 5
@@ -1560,6 +1595,7 @@ define void @test94(i16* %ptr, i16 %cmp, i16 %val) {
define void @test95(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test95:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: .LBB95_1:
; PPC64LE-NEXT: lharx 6, 0, 3
@@ -1582,6 +1618,7 @@ define void @test95(i16* %ptr, i16 %cmp, i16 %val) {
define void @test96(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test96:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: lwsync
; PPC64LE-NEXT: .LBB96_1:
; PPC64LE-NEXT: lharx 6, 0, 3
@@ -1604,6 +1641,7 @@ define void @test96(i16* %ptr, i16 %cmp, i16 %val) {
define void @test97(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test97:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: .LBB97_1:
; PPC64LE-NEXT: lharx 6, 0, 3
@@ -1626,6 +1664,7 @@ define void @test97(i16* %ptr, i16 %cmp, i16 %val) {
define void @test98(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test98:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: .LBB98_1:
; PPC64LE-NEXT: lharx 6, 0, 3
@@ -1648,6 +1687,7 @@ define void @test98(i16* %ptr, i16 %cmp, i16 %val) {
define void @test99(i16* %ptr, i16 %cmp, i16 %val) {
; PPC64LE-LABEL: test99:
; PPC64LE: # %bb.0:
+; PPC64LE-NEXT: rlwinm 4, 4, 0, 16, 31
; PPC64LE-NEXT: sync
; PPC64LE-NEXT: .LBB99_1:
; PPC64LE-NEXT: lharx 6, 0, 3
diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 333efb04913d..1a483355319f 100644
--- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -4780,3 +4780,42 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x doub
ret <2 x double> %res
}
+; PR35977
+define void @test_zext_v8i8_to_v8i16(<8 x i8>* %arg, <8 x i16>* %arg1) {
+; CHECK-LABEL: test_zext_v8i8_to_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT: vpsllw $8, %xmm0, %xmm0
+; CHECK-NEXT: vmovdqa %xmm0, (%rsi)
+; CHECK-NEXT: retq
+ %tmp = getelementptr <8 x i8>, <8 x i8>* %arg, i32 0
+ %tmp2 = load <8 x i8>, <8 x i8>* %tmp
+ %tmp3 = extractelement <8 x i8> %tmp2, i32 0
+ %tmp4 = zext i8 %tmp3 to i16
+ %tmp5 = insertelement <8 x i16> undef, i16 %tmp4, i32 0
+ %tmp6 = extractelement <8 x i8> %tmp2, i32 1
+ %tmp7 = zext i8 %tmp6 to i16
+ %tmp8 = insertelement <8 x i16> %tmp5, i16 %tmp7, i32 1
+ %tmp9 = extractelement <8 x i8> %tmp2, i32 2
+ %tmp10 = zext i8 %tmp9 to i16
+ %tmp11 = insertelement <8 x i16> %tmp8, i16 %tmp10, i32 2
+ %tmp12 = extractelement <8 x i8> %tmp2, i32 3
+ %tmp13 = zext i8 %tmp12 to i16
+ %tmp14 = insertelement <8 x i16> %tmp11, i16 %tmp13, i32 3
+ %tmp15 = extractelement <8 x i8> %tmp2, i32 4
+ %tmp16 = zext i8 %tmp15 to i16
+ %tmp17 = insertelement <8 x i16> %tmp14, i16 %tmp16, i32 4
+ %tmp18 = extractelement <8 x i8> %tmp2, i32 5
+ %tmp19 = zext i8 %tmp18 to i16
+ %tmp20 = insertelement <8 x i16> %tmp17, i16 %tmp19, i32 5
+ %tmp21 = extractelement <8 x i8> %tmp2, i32 6
+ %tmp22 = zext i8 %tmp21 to i16
+ %tmp23 = insertelement <8 x i16> %tmp20, i16 %tmp22, i32 6
+ %tmp24 = extractelement <8 x i8> %tmp2, i32 7
+ %tmp25 = zext i8 %tmp24 to i16
+ %tmp26 = insertelement <8 x i16> %tmp23, i16 %tmp25, i32 7
+ %tmp27 = shl <8 x i16> %tmp26, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %tmp28 = getelementptr <8 x i16>, <8 x i16>* %arg1, i32 0
+ store <8 x i16> %tmp27, <8 x i16>* %tmp28
+ ret void
+}
diff --git a/test/CodeGen/X86/darwin-bzero.ll b/test/CodeGen/X86/darwin-bzero.ll
index 410d67ff0ec1..3d03ec677a01 100644
--- a/test/CodeGen/X86/darwin-bzero.ll
+++ b/test/CodeGen/X86/darwin-bzero.ll
@@ -1,10 +1,13 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck -check-prefixes=CHECK,BZERO %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck -check-prefixes=CHECK,BZERO %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck -check-prefixes=CHECK,NOBZERO %s
+; RUN: llc < %s -mtriple=x86_64-apple-ios10.0-simulator | FileCheck -check-prefixes=CHECK,NOBZERO %s
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
; CHECK-LABEL: foo:
-; CHECK: {{calll|callq}} ___bzero
+; BZERO: {{calll|callq}} ___bzero
+; NOBZERO-NOT: bzero
define void @foo(i8* %p, i32 %len) {
call void @llvm.memset.p0i8.i32(i8* %p, i8 0, i32 %len, i32 1, i1 false)
ret void
diff --git a/test/CodeGen/X86/inline-asm-A-constraint.ll b/test/CodeGen/X86/inline-asm-A-constraint.ll
index 7975b318eff5..2ad011e88e0d 100644
--- a/test/CodeGen/X86/inline-asm-A-constraint.ll
+++ b/test/CodeGen/X86/inline-asm-A-constraint.ll
@@ -19,7 +19,8 @@ entry:
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1
ret { i64, i64 } %.fca.1.insert
}
-; CHECK: lock cmpxchg16b
+; CHECK: lock
+; CHECK-NEXT: cmpxchg16b
attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/test/CodeGen/X86/pr35761.ll b/test/CodeGen/X86/pr35761.ll
new file mode 100644
index 000000000000..0bf81bff841f
--- /dev/null
+++ b/test/CodeGen/X86/pr35761.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux %s -o - | FileCheck %s
+
+@x = global i8 0, align 1
+@y = global i32 0, align 4
+@z = global i24 0, align 4
+
+define void @PR35761(i32 %call) {
+; CHECK-LABEL: PR35761:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movzbl {{.*}}(%rip), %eax
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: movzbl {{.*}}(%rip), %ecx
+; CHECK-NEXT: xorl $255, %ecx
+; CHECK-NEXT: orl %eax, %ecx
+; CHECK-NEXT: movw %cx, {{.*}}(%rip)
+; CHECK-NEXT: movb $0, z+{{.*}}(%rip)
+; CHECK-NEXT: retq
+entry:
+ %0 = load i8, i8* @x, align 1
+ %tobool = trunc i8 %0 to i1
+ %conv = zext i1 %tobool to i32
+ %or = or i32 32767, %call
+ %neg = xor i32 %or, -1
+ %neg1 = xor i32 %neg, -1
+ %1 = load i32, i32* @y, align 4
+ %xor = xor i32 %neg1, %1
+ %or2 = or i32 %conv, %xor
+ %conv3 = trunc i32 %or2 to i8
+ %bf.load = load i24, i24* @z, align 4
+ %2 = zext i8 %conv3 to i24
+ %bf.value = and i24 %2, 4194303
+ store i24 %bf.value, i24* @z, align 2
+ ret void
+}
+
diff --git a/test/CodeGen/X86/pr35972.ll b/test/CodeGen/X86/pr35972.ll
new file mode 100644
index 000000000000..09363fbc89bb
--- /dev/null
+++ b/test/CodeGen/X86/pr35972.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i686-unknown-linux-gnu %s -o - -mattr=avx512bw | FileCheck %s
+
+define void @test3(i32 %c, <64 x i1>* %ptr) {
+; CHECK-LABEL: test3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT: sbbl %ecx, %ecx
+; CHECK-NEXT: kmovd %ecx, %k0
+; CHECK-NEXT: kunpckdq %k0, %k0, %k0
+; CHECK-NEXT: kmovq %k0, (%eax)
+; CHECK-NEXT: retl
+ %cmp = icmp eq i32 %c, 0
+ %insert = insertelement <64 x i1> undef, i1 %cmp, i32 0
+ %shuf = shufflevector <64 x i1> %insert, <64 x i1> undef, <64 x i32> zeroinitializer
+ store <64 x i1> %shuf, <64 x i1>* %ptr
+ ret void
+}
+
diff --git a/test/CodeGen/X86/pr37563.ll b/test/CodeGen/X86/pr37563.ll
new file mode 100644
index 000000000000..934902d8e0d0
--- /dev/null
+++ b/test/CodeGen/X86/pr37563.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - | FileCheck %s
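+; Bitfield loads and stores over the packed layout of @z; the 0xFE0000FFFF mask clears the middle bits of the i40 field.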
+
+%struct.S = type <{ i16, i24, [5 x i8], i8, i16, [2 x i8] }>
+
+@z = global { i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] } { i16 -724, i8 94, i8 -18, i8 5, i8 undef, i8 96, i8 104, i8 -24, i8 10, i8 0, [5 x i8] undef }, align 8
+@tf_3_var_136 = global i64 0, align 8
+@.str = private unnamed_addr constant [6 x i8] c"%llu\0A\00", align 1
+
+define void @PR35763() {
+; CHECK-LABEL: PR35763:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movzwl {{.*}}(%rip), %eax
+; CHECK-NEXT: movzwl z+{{.*}}(%rip), %ecx
+; CHECK-NEXT: orl %eax, %ecx
+; CHECK-NEXT: movq %rcx, {{.*}}(%rip)
+; CHECK-NEXT: movl z+{{.*}}(%rip), %eax
+; CHECK-NEXT: movzbl z+{{.*}}(%rip), %ecx
+; CHECK-NEXT: shlq $32, %rcx
+; CHECK-NEXT: orq %rax, %rcx
+; CHECK-NEXT: movabsq $1090921758719, %rax # imm = 0xFE0000FFFF
+; CHECK-NEXT: andq %rcx, %rax
+; CHECK-NEXT: movl %eax, z+{{.*}}(%rip)
+; CHECK-NEXT: shrq $32, %rax
+; CHECK-NEXT: movb %al, z+{{.*}}(%rip)
+; CHECK-NEXT: retq
+entry:
+ %0 = load i16, i16* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 0), align 8
+ %conv = sext i16 %0 to i32
+ %bf.load = load i32, i32* bitcast (i24* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 1) to i32*), align 2
+ %bf.clear = and i32 %bf.load, 2097151
+ %bf.cast = zext i32 %bf.clear to i64
+ %conv1 = trunc i64 %bf.cast to i32
+ %or = or i32 %conv, %conv1
+ %conv2 = trunc i32 %or to i16
+ %conv3 = zext i16 %conv2 to i64
+ store i64 %conv3, i64* @tf_3_var_136, align 8
+ %bf.load4 = load i40, i40* bitcast ([5 x i8]* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 2) to i40*), align 2
+ %bf.clear5 = and i40 %bf.load4, -8589869057
+ store i40 %bf.clear5, i40* bitcast ([5 x i8]* getelementptr inbounds (%struct.S, %struct.S* bitcast ({ i16, i8, i8, i8, i8, i8, i8, i8, i8, i8, [5 x i8] }* @z to %struct.S*), i32 0, i32 2) to i40*), align 2
+ ret void
+}
diff --git a/test/CodeGen/X86/var-permute-128.ll b/test/CodeGen/X86/var-permute-128.ll
index fb5f02e8d5d2..ba78cf7ee180 100644
--- a/test/CodeGen/X86/var-permute-128.ll
+++ b/test/CodeGen/X86/var-permute-128.ll
@@ -207,13 +207,12 @@ define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSSE3-LABEL: var_shuffle_v16i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb %xmm0, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: var_shuffle_v16i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%index0 = extractelement <16 x i8> %indices, i32 0
%index1 = extractelement <16 x i8> %indices, i32 1
diff --git a/test/CodeGen/X86/var-permute-256.ll b/test/CodeGen/X86/var-permute-256.ll
index 3baab2476d40..b624fb087193 100644
--- a/test/CodeGen/X86/var-permute-256.ll
+++ b/test/CodeGen/X86/var-permute-256.ll
@@ -1277,3 +1277,183 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi
%ret7 = insertelement <8 x float> %ret6, float %v7, i32 7
ret <8 x float> %ret7
}
+
+define <8 x i32> @pr35820(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind {
+; AVX1-LABEL: pr35820:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpextrq $1, %xmm1, %r8
+; AVX1-NEXT: movq %r8, %r10
+; AVX1-NEXT: shrq $30, %r10
+; AVX1-NEXT: vmovq %xmm1, %r9
+; AVX1-NEXT: movq %r9, %rsi
+; AVX1-NEXT: shrq $30, %rsi
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: andl $3, %r9d
+; AVX1-NEXT: andl $12, %esi
+; AVX1-NEXT: andl $3, %r8d
+; AVX1-NEXT: andl $12, %r10d
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: movq %rax, %rdi
+; AVX1-NEXT: shrq $30, %rdi
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rdx
+; AVX1-NEXT: shrq $30, %rdx
+; AVX1-NEXT: andl $3, %ecx
+; AVX1-NEXT: andl $12, %edx
+; AVX1-NEXT: andl $3, %eax
+; AVX1-NEXT: andl $12, %edi
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rdx), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rax,4), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $2, -24(%rsp,%r8,4), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $3, -24(%rsp,%r10), %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; INT256-LABEL: pr35820:
+; INT256: # %bb.0: # %entry
+; INT256-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; INT256-NEXT: retq
+entry:
+ %tmp1 = extractelement <8 x i32> %indices, i32 0
+ %vecext2.8 = extractelement <4 x i32> %v, i32 %tmp1
+ %tmp2 = extractelement <8 x i32> %indices, i32 1
+ %vecext2.9 = extractelement <4 x i32> %v, i32 %tmp2
+ %tmp3 = extractelement <8 x i32> %indices, i32 2
+ %vecext2.10 = extractelement <4 x i32> %v, i32 %tmp3
+ %tmp4 = extractelement <8 x i32> %indices, i32 3
+ %vecext2.11 = extractelement <4 x i32> %v, i32 %tmp4
+ %tmp5 = extractelement <8 x i32> %indices, i32 4
+ %vecext2.12 = extractelement <4 x i32> %v, i32 %tmp5
+ %tmp6 = extractelement <8 x i32> %indices, i32 5
+ %vecext2.13 = extractelement <4 x i32> %v, i32 %tmp6
+ %tmp7 = extractelement <8 x i32> %indices, i32 6
+ %vecext2.14 = extractelement <4 x i32> %v, i32 %tmp7
+ %tmp8 = extractelement <8 x i32> %indices, i32 7
+ %vecext2.15 = extractelement <4 x i32> %v, i32 %tmp8
+ %tmp9 = insertelement <8 x i32> undef, i32 %vecext2.8, i32 0
+ %tmp10 = insertelement <8 x i32> %tmp9, i32 %vecext2.9, i32 1
+ %tmp11 = insertelement <8 x i32> %tmp10, i32 %vecext2.10, i32 2
+ %tmp12 = insertelement <8 x i32> %tmp11, i32 %vecext2.11, i32 3
+ %tmp13 = insertelement <8 x i32> %tmp12, i32 %vecext2.12, i32 4
+ %tmp14 = insertelement <8 x i32> %tmp13, i32 %vecext2.13, i32 5
+ %tmp15 = insertelement <8 x i32> %tmp14, i32 %vecext2.14, i32 6
+ %tmp16 = insertelement <8 x i32> %tmp15, i32 %vecext2.15, i32 7
+ ret <8 x i32> %tmp16
+}
+
+define <8 x float> @pr35820_float(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind {
+; AVX1-LABEL: pr35820_float:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpextrq $1, %xmm1, %r8
+; AVX1-NEXT: movq %r8, %r10
+; AVX1-NEXT: shrq $30, %r10
+; AVX1-NEXT: vmovq %xmm1, %r9
+; AVX1-NEXT: movq %r9, %rdx
+; AVX1-NEXT: shrq $30, %rdx
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: andl $3, %r9d
+; AVX1-NEXT: andl $12, %edx
+; AVX1-NEXT: andl $3, %r8d
+; AVX1-NEXT: andl $12, %r10d
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: movq %rax, %rdi
+; AVX1-NEXT: shrq $30, %rdi
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: movq %rcx, %rsi
+; AVX1-NEXT: shrq $30, %rsi
+; AVX1-NEXT: andl $3, %ecx
+; AVX1-NEXT: andl $12, %esi
+; AVX1-NEXT: andl $3, %eax
+; AVX1-NEXT: andl $12, %edi
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; INT256-LABEL: pr35820_float:
+; INT256: # %bb.0: # %entry
+; INT256-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; INT256-NEXT: retq
+entry:
+ %tmp1 = extractelement <8 x i32> %indices, i32 0
+ %vecext2.8 = extractelement <4 x float> %v, i32 %tmp1
+ %tmp2 = extractelement <8 x i32> %indices, i32 1
+ %vecext2.9 = extractelement <4 x float> %v, i32 %tmp2
+ %tmp3 = extractelement <8 x i32> %indices, i32 2
+ %vecext2.10 = extractelement <4 x float> %v, i32 %tmp3
+ %tmp4 = extractelement <8 x i32> %indices, i32 3
+ %vecext2.11 = extractelement <4 x float> %v, i32 %tmp4
+ %tmp5 = extractelement <8 x i32> %indices, i32 4
+ %vecext2.12 = extractelement <4 x float> %v, i32 %tmp5
+ %tmp6 = extractelement <8 x i32> %indices, i32 5
+ %vecext2.13 = extractelement <4 x float> %v, i32 %tmp6
+ %tmp7 = extractelement <8 x i32> %indices, i32 6
+ %vecext2.14 = extractelement <4 x float> %v, i32 %tmp7
+ %tmp8 = extractelement <8 x i32> %indices, i32 7
+ %vecext2.15 = extractelement <4 x float> %v, i32 %tmp8
+ %tmp9 = insertelement <8 x float> undef, float %vecext2.8, i32 0
+ %tmp10 = insertelement <8 x float> %tmp9, float %vecext2.9, i32 1
+ %tmp11 = insertelement <8 x float> %tmp10, float %vecext2.10, i32 2
+ %tmp12 = insertelement <8 x float> %tmp11, float %vecext2.11, i32 3
+ %tmp13 = insertelement <8 x float> %tmp12, float %vecext2.12, i32 4
+ %tmp14 = insertelement <8 x float> %tmp13, float %vecext2.13, i32 5
+ %tmp15 = insertelement <8 x float> %tmp14, float %vecext2.14, i32 6
+ %tmp16 = insertelement <8 x float> %tmp15, float %vecext2.15, i32 7
+ ret <8 x float> %tmp16
+}
+
+define <4 x i32> @big_source(<8 x i32> %v, <4 x i32> %indices) unnamed_addr nounwind {
+; AVX-LABEL: big_source:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: movq %rsp, %rbp
+; AVX-NEXT: andq $-32, %rsp
+; AVX-NEXT: subq $64, %rsp
+; AVX-NEXT: vmovq %xmm1, %rax
+; AVX-NEXT: movq %rax, %rcx
+; AVX-NEXT: shrq $30, %rcx
+; AVX-NEXT: andl $28, %ecx
+; AVX-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX-NEXT: movq %rdx, %rsi
+; AVX-NEXT: sarq $32, %rsi
+; AVX-NEXT: andl $7, %eax
+; AVX-NEXT: andl $7, %edx
+; AVX-NEXT: vmovaps %ymm0, (%rsp)
+; AVX-NEXT: andl $7, %esi
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vpinsrd $1, (%rsp,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $2, (%rsp,%rdx,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $3, (%rsp,%rsi,4), %xmm0, %xmm0
+; AVX-NEXT: movq %rbp, %rsp
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+entry:
+ %tmp1 = extractelement <4 x i32> %indices, i32 0
+ %vecext2.8 = extractelement <8 x i32> %v, i32 %tmp1
+ %tmp2 = extractelement <4 x i32> %indices, i32 1
+ %vecext2.9 = extractelement <8 x i32> %v, i32 %tmp2
+ %tmp3 = extractelement <4 x i32> %indices, i32 2
+ %vecext2.10 = extractelement <8 x i32> %v, i32 %tmp3
+ %tmp4 = extractelement <4 x i32> %indices, i32 3
+ %vecext2.11 = extractelement <8 x i32> %v, i32 %tmp4
+ %tmp9 = insertelement <4 x i32> undef, i32 %vecext2.8, i32 0
+ %tmp10 = insertelement <4 x i32> %tmp9, i32 %vecext2.9, i32 1
+ %tmp11 = insertelement <4 x i32> %tmp10, i32 %vecext2.10, i32 2
+ %tmp12 = insertelement <4 x i32> %tmp11, i32 %vecext2.11, i32 3
+ ret <4 x i32> %tmp12
+}
diff --git a/test/MC/COFF/cv-inline-linetable.s b/test/MC/COFF/cv-inline-linetable.s
index 61a42d92f405..c5e28c4d0785 100644
--- a/test/MC/COFF/cv-inline-linetable.s
+++ b/test/MC/COFF/cv-inline-linetable.s
@@ -135,3 +135,29 @@ Ltmp1:
.cv_filechecksums # File index to string table offset subsection
.cv_stringtable # String table
+# CHECK-LABEL: FunctionLineTable [
+# CHECK: LinkageName: ?baz@@YAXXZ
+# CHECK: Flags: 0x1
+# CHECK: CodeSize: 0x3D
+# CHECK: FilenameSegment [
+# CHECK: Filename: D:\src\llvm\build\t.cpp (0x0)
+# CHECK: +0x0 [
+# CHECK: LineNumberStart: 13
+# CHECK: ]
+# CHECK: +0x1 [
+# CHECK: LineNumberStart: 14
+# CHECK: ]
+# CHECK: +0x8 [
+# CHECK: LineNumberStart: 15
+# CHECK: ]
+# There shouldn't be any other line number entries because all the other
+# .cv_locs are on line 15 where the top-level inline call site is.
+# CHECK-NOT: LineNumberStart
+# CHECK: +0x34 [
+# CHECK: LineNumberStart: 16
+# CHECK: ]
+# CHECK: +0x3B [
+# CHECK: LineNumberStart: 17
+# CHECK: ]
+# CHECK: ]
+# CHECK: ]
diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s
index 23846d921a8c..378af768fa99 100644
--- a/test/MC/X86/x86-64.s
+++ b/test/MC/X86/x86-64.s
@@ -99,7 +99,8 @@
// CHECK: shll $2, %eax
sall $2, %eax
-// CHECK: rep movsb
+// CHECK: rep
+// CHECK-NEXT: movsb
rep # comment
movsb
@@ -1557,3 +1558,38 @@ ptwriteq 0xdeadbeef(%rbx,%rcx,8)
// CHECK: ptwriteq %rax
// CHECK: encoding: [0xf3,0x48,0x0f,0xae,0xe0]
ptwriteq %rax
+
+// __asm __volatile(
+// "pushf \n\t"
+// "popf \n\t"
+// "rep \n\t"
+// ".byte 0x0f, 0xa7, 0xd0"
+// );
+// CHECK: pushfq
+// CHECK-NEXT: popfq
+// CHECK-NEXT: rep
+// CHECK-NEXT: .byte 15
+// CHECK-NEXT: .byte 167
+// CHECK-NEXT: .byte 208
+pushfq
+popfq
+rep
+.byte 15
+.byte 167
+.byte 208
+
+// CHECK: lock
+// CHECK: cmpxchgl
+ cmp $0, %edx
+ je 1f
+ lock
+1: cmpxchgl %ecx,(%rdi)
+
+// CHECK: rep
+// CHECK-NEXT: byte
+rep
+.byte 0xa4 # movsb
+
+// CHECK: lock
+// This line has to be the last one in the file
+lock
diff --git a/test/ThinLTO/X86/Inputs/dicompositetype-unique2.ll b/test/ThinLTO/X86/Inputs/dicompositetype-unique2.ll
new file mode 100644
index 000000000000..9a9ee7223c90
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/dicompositetype-unique2.ll
@@ -0,0 +1,46 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-scei-ps4"
+
+%struct.CFVS = type { %struct.Vec }
+%struct.Vec = type { i8 }
+%struct.S = type { i8 }
+
+define void @_ZN4CFVSD2Ev(%struct.CFVS* %this) unnamed_addr align 2 !dbg !8 {
+entry:
+ %this.addr = alloca %struct.CFVS*, align 8
+ store %struct.CFVS* %this, %struct.CFVS** %this.addr, align 8
+ %this1 = load %struct.CFVS*, %struct.CFVS** %this.addr, align 8
+ %m_val = getelementptr inbounds %struct.CFVS, %struct.CFVS* %this1, i32 0, i32 0
+ ret void
+}
+
+declare dereferenceable(1) %struct.S* @_Z3Getv()
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 321360) (llvm/trunk 321359)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "bz188598-b.cpp", directory: "")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!8 = distinct !DISubprogram(name: "~CFVS", linkageName: "_ZN4CFVSD2Ev", scope: !9, file: !1, line: 2, type: !28, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !27, variables: !2)
+!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CFVS", file: !10, line: 7, size: 8, elements: !11, identifier: "_ZTS4CFVS")
+!10 = !DIFile(filename: "./bz188598.h", directory: "")
+!11 = !{!12, !27}
+!12 = !DIDerivedType(tag: DW_TAG_member, name: "m_val", scope: !9, file: !10, line: 9, baseType: !13, size: 8)
+!13 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Vec<&Get>", file: !10, line: 4, size: 8, elements: !14, templateParams: !19, identifier: "_ZTS3VecIXadL_Z3GetvEEE")
+!14 = !{!35}
+!19 = !{!20}
+!20 = !DITemplateValueParameter(name: "F", type: !21, value: %struct.S* ()* @_Z3Getv)
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !22, size: 64)
+!22 = !DIDerivedType(tag: DW_TAG_typedef, name: "Func", file: !10, line: 2, baseType: !23)
+!23 = !DISubroutineType(types: !24)
+!24 = !{!35}
+!27 = !DISubprogram(name: "~CFVS", scope: !9, file: !10, line: 8, type: !28, isLocal: false, isDefinition: false, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false)
+!28 = !DISubroutineType(types: !29)
+!29 = !{null, !30}
+!30 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!35 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/test/ThinLTO/X86/dicompositetype-unique2.ll b/test/ThinLTO/X86/dicompositetype-unique2.ll
new file mode 100644
index 000000000000..924579569270
--- /dev/null
+++ b/test/ThinLTO/X86/dicompositetype-unique2.ll
@@ -0,0 +1,69 @@
+; RUN: opt -module-summary -o %t1.bc %s
+; RUN: opt -module-summary -o %t2.bc %S/Inputs/dicompositetype-unique2.ll
+; RUN: llvm-lto --thinlto-action=run %t1.bc %t2.bc -thinlto-save-temps=%t3.
+; RUN: llvm-dis %t3.0.3.imported.bc -o - | FileCheck %s
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t --save-temps \
+; RUN: -r %t1.bc,_ZN1CD2Ev,pl \
+; RUN: -r %t1.bc,_ZN4CFVSD2Ev,l \
+; RUN: -r %t1.bc,_Z3Getv,l \
+; RUN: -r %t2.bc,_ZN4CFVSD2Ev,pl \
+; RUN: -r %t2.bc,_Z3Getv,l
+; RUN: llvm-dis %t.1.3.import.bc -o - | FileCheck %s
+
+; Only llvm-lto2 adds the dso_local keyword, hence the {{.*}}
+; CHECK: define available_externally{{.*}} void @_ZN4CFVSD2Ev
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-scei-ps4"
+
+%class.C = type <{ i32 (...)**, %class.A, %struct.CFVS, [6 x i8] }>
+%class.A = type { %struct.Vec }
+%struct.Vec = type { i8 }
+%struct.CFVS = type { %struct.Vec }
+%struct.S = type { i8 }
+
+define void @_ZN1CD2Ev(%class.C* %this) unnamed_addr align 2 !dbg !8 {
+entry:
+ %this.addr = alloca %class.C*, align 8
+ %this1 = load %class.C*, %class.C** %this.addr, align 8
+ %m = getelementptr inbounds %class.C, %class.C* %this1, i32 0, i32 2
+ call void @_ZN4CFVSD2Ev(%struct.CFVS* %m), !dbg !50
+ ret void
+}
+
+declare void @_ZN4CFVSD2Ev(%struct.CFVS*) unnamed_addr
+
+declare dereferenceable(1) %struct.S* @_Z3Getv()
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 321360) (llvm/trunk 321359)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "bz188598-a.cpp", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!8 = distinct !DISubprogram(name: "~C", linkageName: "_ZN1CD2Ev", scope: !9, file: !1, line: 9, type: !47, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: false, unit: !0, declaration: !46, variables: !2)
+!9 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "C", file: !1, line: 5, size: 128, elements: !10, vtableHolder: !9, identifier: "_ZTS1C")
+!10 = !{!38, !46}
+!15 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Vec<&Get>", file: !16, line: 4, size: 8, elements: !17, templateParams: !22, identifier: "_ZTS3VecIXadL_Z3GetvEEE")
+!16 = !DIFile(filename: "./bz188598.h", directory: ".")
+!17 = !{!55}
+!22 = !{!23}
+!23 = !DITemplateValueParameter(name: "F", type: !24, value: %struct.S* ()* @_Z3Getv)
+!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !25, size: 64)
+!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "Func", file: !16, line: 2, baseType: !26)
+!26 = !DISubroutineType(types: !27)
+!27 = !{!55}
+!38 = !DIDerivedType(tag: DW_TAG_member, name: "m", scope: !9, file: !1, line: 7, baseType: !39, size: 8, offset: 72)
+!39 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CFVS", file: !16, line: 7, size: 8, elements: !40, identifier: "_ZTS4CFVS")
+!40 = !{!41}
+!41 = !DIDerivedType(tag: DW_TAG_member, name: "m_val", scope: !39, file: !16, line: 9, baseType: !15, size: 8)
+!46 = !DISubprogram(name: "~C", scope: !9, file: !1, line: 6, type: !47, isLocal: false, isDefinition: false, scopeLine: 6, containingType: !9, virtuality: DW_VIRTUALITY_virtual, virtualIndex: 0, flags: DIFlagPrototyped, isOptimized: false)
+!47 = !DISubroutineType(types: !48)
+!48 = !{!55}
+!50 = !DILocation(line: 9, scope: !51)
+!51 = distinct !DILexicalBlock(scope: !8, file: !1, line: 9)
+!55 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrmode-select.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrmode-select.ll
new file mode 100644
index 000000000000..b153a8b1e53f
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/X86/sink-addrmode-select.ll
@@ -0,0 +1,19 @@
+; RUN: opt -S -codegenprepare -disable-complex-addr-modes=false -addr-sink-new-select=true %s | FileCheck %s --check-prefix=CHECK
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Select when both an offset and a scaled index register are present.
+define i64 @test1(i1 %c, i64* %b, i64 %scale) {
+; CHECK-LABEL: @test1
+entry:
+; CHECK-LABEL: entry:
+ %g = getelementptr inbounds i64, i64* %b, i64 %scale
+ %g1 = getelementptr inbounds i64, i64* %g, i64 8
+ %g2 = getelementptr inbounds i64, i64* %g, i64 16
+ %s = select i1 %c, i64* %g1, i64* %g2
+; CHECK-NOT: sunkaddr
+ %v = load i64 , i64* %s, align 8
+ ret i64 %v
+}
+
diff --git a/test/Transforms/GVNHoist/pr35222-hoist-load.ll b/test/Transforms/GVNHoist/pr35222-hoist-load.ll
index 7e9c62006162..b9b1a870a59b 100644
--- a/test/Transforms/GVNHoist/pr35222-hoist-load.ll
+++ b/test/Transforms/GVNHoist/pr35222-hoist-load.ll
@@ -1,4 +1,5 @@
; RUN: opt -S -gvn-hoist < %s | FileCheck %s
+; CHECK-LABEL: build_tree
; CHECK: load
; CHECK: load
; Check that the load is not hoisted because the call can potentially
@@ -23,3 +24,47 @@ do.end: ; preds = %do.body
}
declare i1 @pqdownheap(i32)
+
+@i = external hidden unnamed_addr global i32, align 4
+@j = external hidden unnamed_addr global [573 x i32], align 4
+@v = external global i1
+
+; CHECK-LABEL: test
+; CHECK-LABEL: do.end
+; CHECK: load
+; Check that the load is not hoisted because the call can potentially
+; modify the global
+
+define i32 @test() {
+entry:
+ br label %for.cond
+
+for.cond:
+ %a3 = load volatile i1, i1* @v
+ br i1 %a3, label %for.body, label %while.end
+
+for.body:
+ br label %if.then
+
+if.then:
+ %tmp4 = load i32, i32* @i, align 4
+ br label %for.cond
+
+while.end:
+ br label %do.body
+
+do.body:
+ %tmp9 = load i32, i32* getelementptr inbounds ([573 x i32], [573 x i32]* @j,
+i32 0, i32 1), align 4
+ %tmp10 = load i32, i32* @i, align 4
+ call void @fn()
+ %a1 = load volatile i1, i1* @v
+ br i1 %a1, label %do.body, label %do.end
+
+do.end:
+ %tmp20 = load i32, i32* getelementptr inbounds ([573 x i32], [573 x i32]* @j,
+i32 0, i32 1), align 4
+ ret i32 %tmp20
+}
+
+declare void @fn()
diff --git a/test/Transforms/JumpThreading/ddt-crash3.ll b/test/Transforms/JumpThreading/ddt-crash3.ll
new file mode 100644
index 000000000000..50ac86a3fb5b
--- /dev/null
+++ b/test/Transforms/JumpThreading/ddt-crash3.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -jump-threading -disable-output -verify-dom-info
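+; Reduced crash reproducer; -verify-dom-info checks that dominator info stays valid after jump threading.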
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@global = external local_unnamed_addr global i64, align 8
+@global.1 = external local_unnamed_addr global i64, align 8
+@global.2 = external local_unnamed_addr global i64, align 8
+
+; Function Attrs: norecurse noreturn nounwind uwtable
+define void @hoge() local_unnamed_addr #0 {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb26, %bb
+ %tmp = load i64, i64* @global, align 8, !tbaa !1
+ %tmp2 = icmp eq i64 %tmp, 0
+ br i1 %tmp2, label %bb27, label %bb3
+
+bb3: ; preds = %bb1
+ %tmp4 = load i64, i64* @global.1, align 8, !tbaa !1
+ %tmp5 = icmp eq i64 %tmp4, 0
+ br i1 %tmp5, label %bb23, label %bb23
+
+bb23: ; preds = %bb3, %bb3
+ br label %bb26
+
+bb26: ; preds = %bb27, %bb23
+ br label %bb1
+
+bb27: ; preds = %bb1
+ br label %bb26
+}
+
+attributes #0 = { norecurse noreturn nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 7.0.0 "}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"long", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/JumpThreading/ddt-crash4.ll b/test/Transforms/JumpThreading/ddt-crash4.ll
new file mode 100644
index 000000000000..9bf08395d660
--- /dev/null
+++ b/test/Transforms/JumpThreading/ddt-crash4.ll
@@ -0,0 +1,75 @@
+; RUN: opt < %s -jump-threading -disable-output -verify-dom-info
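+; Companion reproducer for the same -jump-threading dominator-info verification.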
+@global = external global i64, align 8
+
+define void @f() {
+bb:
+ br label %bb1
+
+bb1:
+ %tmp = load i64, i64* @global, align 8
+ %tmp2 = icmp eq i64 %tmp, 0
+ br i1 %tmp2, label %bb27, label %bb3
+
+bb3:
+ %tmp4 = load i64, i64* @global, align 8
+ %tmp5 = icmp eq i64 %tmp4, 0
+ br i1 %tmp5, label %bb6, label %bb7
+
+bb6:
+ br label %bb7
+
+bb7:
+ %tmp8 = phi i1 [ true, %bb3 ], [ undef, %bb6 ]
+ %tmp9 = select i1 %tmp8, i64 %tmp4, i64 0
+ br i1 false, label %bb10, label %bb23
+
+bb10:
+ %tmp11 = load i64, i64* @global, align 8
+ %tmp12 = icmp slt i64 %tmp11, 5
+ br i1 %tmp12, label %bb13, label %bb17
+
+bb13:
+ br label %bb14
+
+bb14:
+ br i1 undef, label %bb15, label %bb16
+
+bb15:
+ unreachable
+
+bb16:
+ br label %bb10
+
+bb17:
+ br label %bb18
+
+bb18:
+ br i1 undef, label %bb22, label %bb13
+
+bb19:
+ br i1 undef, label %bb20, label %bb21
+
+bb20:
+ unreachable
+
+bb21:
+ br label %bb18
+
+bb22:
+ br label %bb23
+
+bb23:
+ br i1 undef, label %bb24, label %bb13
+
+bb24:
+ br i1 undef, label %bb26, label %bb25
+
+bb25:
+ br label %bb19
+
+bb26:
+ br label %bb1
+
+bb27:
+ br label %bb24
+}
diff --git a/test/Transforms/LoopVectorize/pr35773.ll b/test/Transforms/LoopVectorize/pr35773.ll
new file mode 100644
index 000000000000..362ece70b898
--- /dev/null
+++ b/test/Transforms/LoopVectorize/pr35773.ll
@@ -0,0 +1,53 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s
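+; The i8 IV is derived from a truncated i32 IV; the CHECK lines verify it gets its own vector induction (IV_FROM_TRUNC).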
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+@a = common local_unnamed_addr global i32 0, align 4
+@b = common local_unnamed_addr global i8 0, align 1
+
+; Function Attrs: norecurse nounwind uwtable
+define void @doit1() local_unnamed_addr {
+entry:
+ br label %for.body
+
+for.body:
+ %main.iv = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+
+ %i8.iv = phi i8 [ 0, %entry ], [ %i8.add, %for.body ]
+ %i32.iv = phi i32 [ 0, %entry ], [ %i32.add, %for.body ]
+
+ %trunc.to.be.converted.to.new.iv = trunc i32 %i32.iv to i8
+ %i8.add = add i8 %i8.iv, %trunc.to.be.converted.to.new.iv
+
+ %noop.conv.under.pse = and i32 %i32.iv, 255
+ %i32.add = add nuw nsw i32 %noop.conv.under.pse, 9
+
+ %inc = add i32 %main.iv, 1
+ %tobool = icmp eq i32 %inc, 16
+ br i1 %tobool, label %for.cond.for.end_crit_edge, label %for.body
+
+; CHECK-LABEL: @doit1(
+; CHECK: vector.body:
+; CHECK-NEXT: [[MAIN_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[MAIN_IV_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
+; CHECK-NEXT: [[I8_IV:%.*]] = phi <4 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[I8_IV_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[I32_IV:%.*]] = phi <4 x i32> [ <i32 0, i32 9, i32 18, i32 27>, [[VECTOR_PH]] ], [ [[I32_IV_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV_FROM_TRUNC:%.*]] = phi <4 x i8> [ <i8 0, i8 9, i8 18, i8 27>, [[VECTOR_PH]] ], [ [[IV_FROM_TRUNC_NEXT:%.*]], [[VECTOR_BODY]] ]
+
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[MAIN_IV]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[MAIN_IV]], 0
+
+; CHECK-NEXT: [[I8_IV_NEXT]] = add <4 x i8> [[I8_IV]], [[IV_FROM_TRUNC]]
+
+; CHECK-NEXT: [[MAIN_IV_NEXT]] = add i32 [[MAIN_IV]], 4
+; CHECK-NEXT: [[I32_IV_NEXT]] = add <4 x i32> [[I32_IV]], <i32 36, i32 36, i32 36, i32 36>
+; CHECK-NEXT: [[IV_FROM_TRUNC_NEXT]] = add <4 x i8> [[IV_FROM_TRUNC]], <i8 36, i8 36, i8 36, i8 36>
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[MAIN_IV_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+
+for.cond.for.end_crit_edge:
+ store i8 %i8.add, i8* @b, align 1
+ br label %for.end
+
+for.end:
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/PR35628_1.ll b/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
new file mode 100644
index 000000000000..a573fc911eef
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/PR35628_1.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
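+; The squared lane values are reduced with a horizontal add; the non-squared lanes are folded back in as the OP_EXTRA add chain.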
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+define void @mainTest(i32* %ptr) #0 {
+; CHECK-LABEL: @mainTest(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[PTR:%.*]], null
+; CHECK-NEXT: br i1 [[CMP]], label [[LOOP:%.*]], label [[BAIL_OUT:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA5:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[PTR]] to <4 x i32>*
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[TMP4]], [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 1, undef
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], undef
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], undef
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], undef
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], undef
+; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP6]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP13]], undef
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP8]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP16]], 1
+; CHECK-NEXT: [[OP_EXTRA3:%.*]] = add i32 [[OP_EXTRA]], [[TMP7]]
+; CHECK-NEXT: [[OP_EXTRA4:%.*]] = add i32 [[OP_EXTRA3]], [[TMP6]]
+; CHECK-NEXT: [[OP_EXTRA5]] = add i32 [[OP_EXTRA4]], [[TMP5]]
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP15]], undef
+; CHECK-NEXT: br label [[LOOP]]
+; CHECK: bail_out:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cmp = icmp eq i32* %ptr, null
+ br i1 %cmp, label %loop, label %bail_out
+
+loop:
+ %dummy_phi = phi i32 [ 1, %entry ], [ %18, %loop ]
+ %0 = load i32, i32 * %ptr , align 4
+ %1 = mul i32 %0, %0
+ %2 = add i32 1, %1
+ %3 = getelementptr inbounds i32, i32 * %ptr, i64 1
+ %4 = load i32, i32 * %3 , align 4
+ %5 = mul i32 %4, %4
+ %6 = add i32 %2, %4
+ %7 = add i32 %6, %5
+ %8 = getelementptr inbounds i32, i32 *%ptr, i64 2
+ %9 = load i32, i32 * %8 , align 4
+ %10 = mul i32 %9, %9
+ %11 = add i32 %7, %9
+ %12 = add i32 %11, %10
+ %13 = sext i32 %9 to i64
+ %14 = getelementptr inbounds i32, i32 *%ptr, i64 3
+ %15 = load i32, i32 * %14 , align 4
+ %16 = mul i32 %15, %15
+ %17 = add i32 %12, %15
+ %18 = add i32 %17, %16
+ br label %loop
+
+bail_out:
+ ret void
+}
+
+attributes #0 = { "target-cpu"="westmere" }
+
diff --git a/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
new file mode 100644
index 000000000000..52a6d73db981
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+
+define void @test() #0 {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA3:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP6:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMP0]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP0]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP0]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> <i64 3, i64 2, i64 1, i64 0>, [[TMP4]]
+; CHECK-NEXT: [[TMP6]] = extractelement <4 x i64> [[TMP5]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
+; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP7]], 32
+; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i64> <i64 1, i64 1, i64 1, i64 1>, [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = ashr exact <4 x i64> [[TMP8]], <i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT: [[SUM1:%.*]] = add i64 undef, undef
+; CHECK-NEXT: [[SUM2:%.*]] = add i64 [[SUM1]], undef
+; CHECK-NEXT: [[ZSUM:%.*]] = add i64 [[SUM2]], 0
+; CHECK-NEXT: [[JOIN:%.*]] = add i64 undef, [[ZSUM]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP9]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i64 [[TMP10]], 0
+; CHECK-NEXT: [[OP_EXTRA3]] = add i64 [[OP_EXTRA]], [[TMP6]]
+; CHECK-NEXT: [[LAST:%.*]] = add i64 [[JOIN]], undef
+; CHECK-NEXT: br label [[LOOP]]
+;
+entry:
+ br label %loop
+
+loop:
+ %dummy_phi = phi i64 [ 1, %entry ], [ %last, %loop ]
+ %0 = phi i64 [ 2, %entry ], [ %fork, %loop ]
+ %inc1 = add i64 %0, 1
+ %inc2 = add i64 %0, 2
+ %inc11 = add i64 1, %inc1
+ %exact1 = ashr exact i64 %inc11, 32
+ %inc3 = add i64 %0, 3
+ %dummy_add = add i16 0, 0
+ %inc12 = add i64 1, %inc2
+ %exact2 = ashr exact i64 %inc12, 32
+ %dummy_shl = shl i64 %inc3, 32
+ %inc13 = add i64 1, %inc3
+ %exact3 = ashr exact i64 %inc13, 32
+ %fork = add i64 %0, 0
+ %sum1 = add i64 %exact3, %exact2
+ %sum2 = add i64 %sum1, %exact1
+ %zsum = add i64 %sum2, 0
+ %sext22 = add i64 1, %fork
+ %exact4 = ashr exact i64 %sext22, 32
+ %join = add i64 %fork, %zsum
+ %last = add i64 %join, %exact4
+ br label %loop
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/PR35777.ll b/test/Transforms/SLPVectorizer/X86/PR35777.ll
new file mode 100644
index 000000000000..f3983d716d08
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/PR35777.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -verify -slp-vectorizer -o - -S -mtriple=x86_64-apple-macosx10.13.0 | FileCheck %s
+
+@global = local_unnamed_addr global [6 x double] zeroinitializer, align 16
+
+define { i64, i64 } @patatino(double %arg) {
+; CHECK-LABEL: @patatino(
+; CHECK-NEXT: bb:
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, <2 x double>* bitcast ([6 x double]* @global to <2 x double>*), align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2) to <2 x double>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> undef, double [[ARG:%.*]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[ARG]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = trunc <2 x i64> [[TMP9]] to <2 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP12]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP14]], 1
+; CHECK-NEXT: ret { i64, i64 } [[TMP17]]
+;
+bb:
+ %tmp = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 0), align 16
+ %tmp1 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2), align 16
+ %tmp2 = fmul double %tmp1, %arg
+ %tmp3 = fadd double %tmp, %tmp2
+ %tmp4 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4), align 16
+ %tmp5 = fadd double %tmp4, %tmp3
+ %tmp6 = fptosi double %tmp5 to i32
+ %tmp7 = sext i32 %tmp6 to i64
+ %tmp8 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 1), align 8
+ %tmp9 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 3), align 8
+ %tmp10 = fmul double %tmp9, %arg
+ %tmp11 = fadd double %tmp8, %tmp10
+ %tmp12 = load double, double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 5), align 8
+ %tmp13 = fadd double %tmp12, %tmp11
+ %tmp14 = fptosi double %tmp13 to i32
+ %tmp15 = sext i32 %tmp14 to i64
+ %tmp16 = insertvalue { i64, i64 } undef, i64 %tmp7, 0
+ %tmp17 = insertvalue { i64, i64 } %tmp16, i64 %tmp15, 1
+ ret { i64, i64 } %tmp17
+}
diff --git a/test/Transforms/SLPVectorizer/X86/PR35865.ll b/test/Transforms/SLPVectorizer/X86/PR35865.ll
new file mode 100644
index 000000000000..b022dd7d9155
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/PR35865.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -slp-vectorizer < %s -S -o - -mtriple=x86_64-apple-macosx10.10.0 -mcpu=core2 | FileCheck %s
+
+define void @_Z10fooConvertPDv4_xS0_S0_PKS_() {
+; CHECK-LABEL: @_Z10fooConvertPDv4_xS0_S0_PKS_(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x half> undef, i32 4
+; CHECK-NEXT: [[CONV_I_4_I:%.*]] = fpext half [[TMP0]] to float
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[CONV_I_4_I]] to i32
+; CHECK-NEXT: [[VECINS_I_4_I:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 4
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x half> undef, i32 5
+; CHECK-NEXT: [[CONV_I_5_I:%.*]] = fpext half [[TMP2]] to float
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[CONV_I_5_I]] to i32
+; CHECK-NEXT: [[VECINS_I_5_I:%.*]] = insertelement <8 x i32> [[VECINS_I_4_I]], i32 [[TMP3]], i32 5
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = extractelement <16 x half> undef, i32 4
+ %conv.i.4.i = fpext half %0 to float
+ %1 = bitcast float %conv.i.4.i to i32
+ %vecins.i.4.i = insertelement <8 x i32> undef, i32 %1, i32 4
+ %2 = extractelement <16 x half> undef, i32 5
+ %conv.i.5.i = fpext half %2 to float
+ %3 = bitcast float %conv.i.5.i to i32
+ %vecins.i.5.i = insertelement <8 x i32> %vecins.i.4.i, i32 %3, i32 5
+ ret void
+}
diff --git a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
index 46386e8b63e0..750a44736c97 100644
--- a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@@ -7,8 +7,8 @@ target triple = "x86_64-apple-macosx10.8.0"
define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@@ -20,8 +20,8 @@ define <4 x float> @simple_select(<4 x float> %a, <4 x float> %b, <4 x i32> %c)
; CHECK-NEXT: ret <4 x float> [[RD]]
;
; ZEROTHRESH-LABEL: @simple_select(
-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
-; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@@ -64,18 +64,18 @@ declare void @llvm.assume(i1) nounwind
; This entire tree is ephemeral; don't vectorize any of it.
define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_eph(
-; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
-; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
-; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> %c, i32 2
-; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> %c, i32 3
-; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> %a, i32 0
-; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> %a, i32 1
-; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> %a, i32 2
-; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> %a, i32 3
-; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> %b, i32 0
-; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1
-; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> %b, i32 2
-; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> %b, i32 3
+; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
; CHECK-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
@@ -100,18 +100,18 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
; CHECK-NEXT: ret <4 x float> undef
;
; ZEROTHRESH-LABEL: @simple_select_eph(
-; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
-; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
-; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> %c, i32 2
-; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> %c, i32 3
-; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> %a, i32 0
-; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> %a, i32 1
-; ZEROTHRESH-NEXT: [[A2:%.*]] = extractelement <4 x float> %a, i32 2
-; ZEROTHRESH-NEXT: [[A3:%.*]] = extractelement <4 x float> %a, i32 3
-; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> %b, i32 0
-; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1
-; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> %b, i32 2
-; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> %b, i32 3
+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; ZEROTHRESH-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; ZEROTHRESH-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
; ZEROTHRESH-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
@@ -175,8 +175,8 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
; doesn't matter
define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_insert_out_of_order(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@@ -188,8 +188,8 @@ define <4 x float> @simple_select_insert_out_of_order(<4 x float> %a, <4 x float
; CHECK-NEXT: ret <4 x float> [[RD]]
;
; ZEROTHRESH-LABEL: @simple_select_insert_out_of_order(
-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
-; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 2
; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@@ -233,8 +233,8 @@ declare void @f32_user(float) #0
; Multiple users of the final constructed vector
define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_users(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@@ -247,8 +247,8 @@ define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32
; CHECK-NEXT: ret <4 x float> [[RD]]
;
; ZEROTHRESH-LABEL: @simple_select_users(
-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> %c, zeroinitializer
-; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> %a, <4 x float> %b
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[C:%.*]], zeroinitializer
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]
; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
@@ -291,18 +291,18 @@ define <4 x float> @simple_select_users(<4 x float> %a, <4 x float> %b, <4 x i32
; Unused insertelement
define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_no_users(
-; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
-; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
-; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> %c, i32 2
-; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> %c, i32 3
-; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> %a, i32 0
-; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> %a, i32 1
-; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> %a, i32 2
-; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> %a, i32 3
-; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> %b, i32 0
-; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1
-; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> %b, i32 2
-; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> %b, i32 3
+; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
@@ -330,18 +330,18 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x
; CHECK-NEXT: ret <4 x float> [[RD]]
;
; ZEROTHRESH-LABEL: @simple_select_no_users(
-; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
-; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
-; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> %c, i32 2
-; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> %c, i32 3
-; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> %a, i32 0
-; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> %a, i32 1
-; ZEROTHRESH-NEXT: [[A2:%.*]] = extractelement <4 x float> %a, i32 2
-; ZEROTHRESH-NEXT: [[A3:%.*]] = extractelement <4 x float> %a, i32 3
-; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> %b, i32 0
-; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1
-; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> %b, i32 2
-; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> %b, i32 3
+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; ZEROTHRESH-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; ZEROTHRESH-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
+; ZEROTHRESH-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2
+; ZEROTHRESH-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3
; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
; ZEROTHRESH-NEXT: [[CMP2:%.*]] = icmp ne i32 [[C2]], 0
@@ -387,25 +387,25 @@ define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x
; to do this backwards
define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
; CHECK-LABEL: @reconstruct(
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> %c, i32 0
-; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> %c, i32 1
-; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> %c, i32 2
-; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[TMP3]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> %c, i32 3
-; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[C]], i32 0
+; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[TMP3]], i32 1
+; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[TMP2]], i32 2
+; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[TMP1]], i32 3
; CHECK-NEXT: ret <4 x i32> [[RD]]
;
; ZEROTHRESH-LABEL: @reconstruct(
-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> %c, i32 0
-; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
-; ZEROTHRESH-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> %c, i32 1
-; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[TMP2]], i32 1
-; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> %c, i32 2
-; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[TMP3]], i32 2
-; ZEROTHRESH-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> %c, i32 3
-; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[TMP4]], i32 3
+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2
+; ZEROTHRESH-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3
+; ZEROTHRESH-NEXT: [[RA:%.*]] = insertelement <4 x i32> undef, i32 [[C0]], i32 0
+; ZEROTHRESH-NEXT: [[RB:%.*]] = insertelement <4 x i32> [[RA]], i32 [[C1]], i32 1
+; ZEROTHRESH-NEXT: [[RC:%.*]] = insertelement <4 x i32> [[RB]], i32 [[C2]], i32 2
+; ZEROTHRESH-NEXT: [[RD:%.*]] = insertelement <4 x i32> [[RC]], i32 [[C3]], i32 3
; ZEROTHRESH-NEXT: ret <4 x i32> [[RD]]
;
%c0 = extractelement <4 x i32> %c, i32 0
@@ -421,8 +421,8 @@ define <4 x i32> @reconstruct(<4 x i32> %c) #0 {
define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_v2(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> %c, zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> %a, <2 x float> %b
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[C:%.*]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[A:%.*]], <2 x float> [[B:%.*]]
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
; CHECK-NEXT: [[RA:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
@@ -430,12 +430,12 @@ define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %
; CHECK-NEXT: ret <2 x float> [[RB]]
;
; ZEROTHRESH-LABEL: @simple_select_v2(
-; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <2 x i32> %c, i32 0
-; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <2 x i32> %c, i32 1
-; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <2 x float> %a, i32 0
-; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <2 x float> %a, i32 1
-; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <2 x float> %b, i32 0
-; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <2 x float> %b, i32 1
+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <2 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <2 x float> [[A:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <2 x float> [[A]], i32 1
+; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <2 x float> [[B:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <2 x float> [[B]], i32 1
; ZEROTHRESH-NEXT: [[CMP0:%.*]] = icmp ne i32 [[C0]], 0
; ZEROTHRESH-NEXT: [[CMP1:%.*]] = icmp ne i32 [[C1]], 0
; ZEROTHRESH-NEXT: [[S0:%.*]] = select i1 [[CMP0]], float [[A0]], float [[B0]]
@@ -464,12 +464,12 @@ define <2 x float> @simple_select_v2(<2 x float> %a, <2 x float> %b, <2 x i32> %
; (low cost threshold needed to force this to happen)
define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 {
; CHECK-LABEL: @simple_select_partial_vector(
-; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
-; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
-; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> %a, i32 0
-; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> %a, i32 1
-; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> %b, i32 0
-; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1
+; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
@@ -485,12 +485,12 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b,
; CHECK-NEXT: ret <4 x float> [[RB]]
;
; ZEROTHRESH-LABEL: @simple_select_partial_vector(
-; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> %c, i32 0
-; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> %c, i32 1
-; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> %a, i32 0
-; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> %a, i32 1
-; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> %b, i32 0
-; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> %b, i32 1
+; ZEROTHRESH-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1
+; ZEROTHRESH-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; ZEROTHRESH-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1
; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0
; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1
; ZEROTHRESH-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
@@ -530,7 +530,7 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b,
; must be rescheduled. The case here is from compiling Julia.
define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @reschedule_extract(
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> %a, %b
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
@@ -542,7 +542,7 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
; CHECK-NEXT: ret <4 x float> [[V3]]
;
; ZEROTHRESH-LABEL: @reschedule_extract(
-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = fadd <4 x float> %a, %b
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
; ZEROTHRESH-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
; ZEROTHRESH-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
@@ -576,7 +576,7 @@ define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
; instructions that are erased.
define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: @take_credit(
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> %a, %b
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
; CHECK-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
@@ -588,7 +588,7 @@ define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
; CHECK-NEXT: ret <4 x float> [[V3]]
;
; ZEROTHRESH-LABEL: @take_credit(
-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = fadd <4 x float> %a, %b
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A:%.*]], [[B:%.*]]
; ZEROTHRESH-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
; ZEROTHRESH-NEXT: [[V0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
; ZEROTHRESH-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
@@ -622,10 +622,10 @@ define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
; CHECK-LABEL: @multi_tree(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double %w, i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double %x, i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double %y, i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double %z, i32 3
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double [[W:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[X:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[Z:%.*]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
@@ -640,10 +640,10 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
;
; ZEROTHRESH-LABEL: @multi_tree(
; ZEROTHRESH-NEXT: entry:
-; ZEROTHRESH-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double %w, i32 0
-; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double %x, i32 1
-; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double %y, i32 2
-; ZEROTHRESH-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double %z, i32 3
+; ZEROTHRESH-NEXT: [[TMP0:%.*]] = insertelement <4 x double> undef, double [[W:%.*]], i32 0
+; ZEROTHRESH-NEXT: [[TMP1:%.*]] = insertelement <4 x double> [[TMP0]], double [[X:%.*]], i32 1
+; ZEROTHRESH-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 2
+; ZEROTHRESH-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[Z:%.*]], i32 3
; ZEROTHRESH-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
; ZEROTHRESH-NEXT: [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
; ZEROTHRESH-NEXT: [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
@@ -675,7 +675,7 @@ entry:
define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr #0 {
; CHECK-LABEL: @_vadd256(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = fadd <8 x float> %a, %b
+; CHECK-NEXT: [[TMP0:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
@@ -696,7 +696,7 @@ define <8 x float> @_vadd256(<8 x float> %a, <8 x float> %b) local_unnamed_addr
;
; ZEROTHRESH-LABEL: @_vadd256(
; ZEROTHRESH-NEXT: entry:
-; ZEROTHRESH-NEXT: [[TMP0:%.*]] = fadd <8 x float> %a, %b
+; ZEROTHRESH-NEXT: [[TMP0:%.*]] = fadd <8 x float> [[A:%.*]], [[B:%.*]]
; ZEROTHRESH-NEXT: [[TMP1:%.*]] = extractelement <8 x float> [[TMP0]], i32 0
; ZEROTHRESH-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
; ZEROTHRESH-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP0]], i32 1
diff --git a/test/Transforms/SLPVectorizer/X86/insertvalue.ll b/test/Transforms/SLPVectorizer/X86/insertvalue.ll
index 5884ee7a2675..1af11609fe6f 100644
--- a/test/Transforms/SLPVectorizer/X86/insertvalue.ll
+++ b/test/Transforms/SLPVectorizer/X86/insertvalue.ll
@@ -1,11 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
-; CHECK-LABEL: julia_2xdouble
-; CHECK: load <2 x double>
-; CHECK: load <2 x double>
-; CHECK: fmul <2 x double>
-; CHECK: fadd <2 x double>
define void @julia_2xdouble([2 x double]* sret, [2 x double]*, [2 x double]*, [2 x double]*) {
+; CHECK-LABEL: @julia_2xdouble(
+; CHECK-NEXT: top:
+; CHECK-NEXT: [[PX0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[PY0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[PX1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2]], i64 0, i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[PX0]] to <2 x double>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 4
+; CHECK-NEXT: [[PY1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3]], i64 0, i64 1
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[PY0]] to <2 x double>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1]], i64 0, i64 1
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[PZ0]] to <2 x double>*
+; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
+; CHECK-NEXT: [[I0:%.*]] = insertvalue [2 x double] undef, double [[TMP12]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP11]], i32 1
+; CHECK-NEXT: [[I1:%.*]] = insertvalue [2 x double] [[I0]], double [[TMP13]], 1
+; CHECK-NEXT: store [2 x double] [[I1]], [2 x double]* [[TMP0:%.*]], align 4
+; CHECK-NEXT: ret void
+;
top:
%px0 = getelementptr inbounds [2 x double], [2 x double]* %2, i64 0, i64 0
%x0 = load double, double* %px0, align 4
@@ -29,12 +48,40 @@ top:
ret void
}
-; CHECK-LABEL: julia_4xfloat
-; CHECK: load <4 x float>
-; CHECK: load <4 x float>
-; CHECK: fmul <4 x float>
-; CHECK: fadd <4 x float>
define void @julia_4xfloat([4 x float]* sret, [4 x float]*, [4 x float]*, [4 x float]*) {
+; CHECK-LABEL: @julia_4xfloat(
+; CHECK-NEXT: top:
+; CHECK-NEXT: [[PX0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[PY0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[PX1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 1
+; CHECK-NEXT: [[PY1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 1
+; CHECK-NEXT: [[PX2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 2
+; CHECK-NEXT: [[PY2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 2
+; CHECK-NEXT: [[PX3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 3
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[PX0]] to <4 x float>*
+; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
+; CHECK-NEXT: [[PY3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[PY0]] to <4 x float>*
+; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]]
+; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1:%.*]], i64 0, i64 0
+; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 1
+; CHECK-NEXT: [[PZ2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 2
+; CHECK-NEXT: [[PZ3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 3
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[PZ0]] to <4 x float>*
+; CHECK-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 0
+; CHECK-NEXT: [[I0:%.*]] = insertvalue [4 x float] undef, float [[TMP12]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 1
+; CHECK-NEXT: [[I1:%.*]] = insertvalue [4 x float] [[I0]], float [[TMP13]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 2
+; CHECK-NEXT: [[I2:%.*]] = insertvalue [4 x float] [[I1]], float [[TMP14]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 3
+; CHECK-NEXT: [[I3:%.*]] = insertvalue [4 x float] [[I2]], float [[TMP15]], 3
+; CHECK-NEXT: store [4 x float] [[I3]], [4 x float]* [[TMP0:%.*]], align 4
+; CHECK-NEXT: ret void
+;
top:
%px0 = getelementptr inbounds [4 x float], [4 x float]* %2, i64 0, i64 0
%x0 = load float, float* %px0, align 4
@@ -76,9 +123,27 @@ top:
ret void
}
-; CHECK-LABEL: julia_load_array_of_float
-; CHECK: fsub <4 x float>
define void @julia_load_array_of_float([4 x float]* %a, [4 x float]* %b, [4 x float]* %c) {
+; CHECK-LABEL: @julia_load_array_of_float(
+; CHECK-NEXT: top:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast [4 x float]* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x float], [4 x float]* [[A]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast [4 x float]* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x float], [4 x float]* [[B]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x float] undef, float [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x float] [[C_ARR0]], float [[TMP6]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x float] [[C_ARR1]], float [[TMP7]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x float] [[C_ARR2]], float [[TMP8]], 3
+; CHECK-NEXT: store [4 x float] [[C_ARR3]], [4 x float]* [[C:%.*]], align 4
+; CHECK-NEXT: ret void
+;
top:
%a_arr = load [4 x float], [4 x float]* %a, align 4
%a0 = extractvalue [4 x float] %a_arr, 0
@@ -102,11 +167,27 @@ top:
ret void
}
-; CHECK-LABEL: julia_load_array_of_i32
-; CHECK: load <4 x i32>
-; CHECK: load <4 x i32>
-; CHECK: sub <4 x i32>
define void @julia_load_array_of_i32([4 x i32]* %a, [4 x i32]* %b, [4 x i32]* %c) {
+; CHECK-LABEL: @julia_load_array_of_i32(
+; CHECK-NEXT: top:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast [4 x i32]* [[A:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x i32], [4 x i32]* [[A]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast [4 x i32]* [[B:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x i32], [4 x i32]* [[B]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x i32] undef, i32 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x i32] [[C_ARR0]], i32 [[TMP6]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x i32] [[C_ARR1]], i32 [[TMP7]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x i32] [[C_ARR2]], i32 [[TMP8]], 3
+; CHECK-NEXT: store [4 x i32] [[C_ARR3]], [4 x i32]* [[C:%.*]], align 4
+; CHECK-NEXT: ret void
+;
top:
%a_arr = load [4 x i32], [4 x i32]* %a, align 4
%a0 = extractvalue [4 x i32] %a_arr, 0
@@ -132,9 +213,30 @@ top:
; Almost identical to the previous test, but for a type that should NOT be vectorized.
;
-; CHECK-LABEL: julia_load_array_of_i16
-; CHECK-NOT: i2>
define void @julia_load_array_of_i16([4 x i16]* %a, [4 x i16]* %b, [4 x i16]* %c) {
+; CHECK-LABEL: @julia_load_array_of_i16(
+; CHECK-NEXT: top:
+; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x i16], [4 x i16]* [[A:%.*]], align 4
+; CHECK-NEXT: [[A0:%.*]] = extractvalue [4 x i16] [[A_ARR]], 0
+; CHECK-NEXT: [[A2:%.*]] = extractvalue [4 x i16] [[A_ARR]], 2
+; CHECK-NEXT: [[A1:%.*]] = extractvalue [4 x i16] [[A_ARR]], 1
+; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x i16], [4 x i16]* [[B:%.*]], align 4
+; CHECK-NEXT: [[B0:%.*]] = extractvalue [4 x i16] [[B_ARR]], 0
+; CHECK-NEXT: [[B2:%.*]] = extractvalue [4 x i16] [[B_ARR]], 2
+; CHECK-NEXT: [[B1:%.*]] = extractvalue [4 x i16] [[B_ARR]], 1
+; CHECK-NEXT: [[A3:%.*]] = extractvalue [4 x i16] [[A_ARR]], 3
+; CHECK-NEXT: [[C1:%.*]] = sub i16 [[A1]], [[B1]]
+; CHECK-NEXT: [[B3:%.*]] = extractvalue [4 x i16] [[B_ARR]], 3
+; CHECK-NEXT: [[C0:%.*]] = sub i16 [[A0]], [[B0]]
+; CHECK-NEXT: [[C2:%.*]] = sub i16 [[A2]], [[B2]]
+; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x i16] undef, i16 [[C0]], 0
+; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x i16] [[C_ARR0]], i16 [[C1]], 1
+; CHECK-NEXT: [[C3:%.*]] = sub i16 [[A3]], [[B3]]
+; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x i16] [[C_ARR1]], i16 [[C2]], 2
+; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x i16] [[C_ARR2]], i16 [[C3]], 3
+; CHECK-NEXT: store [4 x i16] [[C_ARR3]], [4 x i16]* [[C:%.*]], align 4
+; CHECK-NEXT: ret void
+;
top:
%a_arr = load [4 x i16], [4 x i16]* %a, align 4
%a0 = extractvalue [4 x i16] %a_arr, 0
@@ -160,11 +262,27 @@ top:
%pseudovec = type { float, float, float, float }
-; CHECK-LABEL: julia_load_struct_of_float
-; CHECK: load <4 x float>
-; CHECK: load <4 x float>
-; CHECK: fsub <4 x float>
define void @julia_load_struct_of_float(%pseudovec* %a, %pseudovec* %b, %pseudovec* %c) {
+; CHECK-LABEL: @julia_load_struct_of_float(
+; CHECK-NEXT: top:
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast %pseudovec* [[A:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[A_STRUCT:%.*]] = load [[PSEUDOVEC:%.*]], %pseudovec* [[A]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast %pseudovec* [[B:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT: [[B_STRUCT:%.*]] = load [[PSEUDOVEC]], %pseudovec* [[B]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0
+; CHECK-NEXT: [[C_STRUCT0:%.*]] = insertvalue [[PSEUDOVEC]] undef, float [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+; CHECK-NEXT: [[C_STRUCT1:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct0, float [[TMP6]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
+; CHECK-NEXT: [[C_STRUCT2:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct1, float [[TMP7]], 2
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3
+; CHECK-NEXT: [[C_STRUCT3:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct2, float [[TMP8]], 3
+; CHECK-NEXT: store [[PSEUDOVEC]] %c_struct3, %pseudovec* [[C:%.*]], align 4
+; CHECK-NEXT: ret void
+;
top:
%a_struct = load %pseudovec, %pseudovec* %a, align 4
%a0 = extractvalue %pseudovec %a_struct, 0
diff --git a/test/Transforms/SLPVectorizer/X86/value-bug.ll b/test/Transforms/SLPVectorizer/X86/value-bug.ll
index 64d2ae1c7d79..7558c724a15d 100644
--- a/test/Transforms/SLPVectorizer/X86/value-bug.ll
+++ b/test/Transforms/SLPVectorizer/X86/value-bug.ll
@@ -1,15 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer < %s -S -mtriple="x86_64-grtev3-linux-gnu" -mcpu=corei7-avx | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-grtev3-linux-gnu"
; We used to crash on this example because we were building a constant
; expression during vectorization and the vectorizer expects instructions
; as elements of the vectorized tree.
-; CHECK-LABEL: @test
; PR19621
define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: bb279:
+; CHECK-NEXT: br label [[BB283:%.*]]
+; CHECK: bb283:
+; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ undef, [[BB279:%.*]] ], [ [[TMP11:%.*]], [[EXIT:%.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x float> [ undef, [[BB279]] ], [ [[TMP13:%.*]], [[EXIT]] ]
+; CHECK-NEXT: br label [[BB284:%.*]]
+; CHECK: bb284:
+; CHECK-NEXT: [[TMP2:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP2]], undef
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef
+; CHECK-NEXT: br label [[BB21_I:%.*]]
+; CHECK: bb21.i:
+; CHECK-NEXT: br i1 undef, label [[BB22_I:%.*]], label [[EXIT]]
+; CHECK: bb22.i:
+; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]]
+; CHECK-NEXT: br label [[BB32_I:%.*]]
+; CHECK: bb32.i:
+; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x double> [ [[TMP5]], [[BB22_I]] ], [ zeroinitializer, [[BB32_I]] ]
+; CHECK-NEXT: br i1 undef, label [[BB32_I]], label [[BB21_I]]
+; CHECK: exit:
+; CHECK-NEXT: [[TMP7:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
+; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> <double undef, double 0.000000e+00>, [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> undef, [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> undef, [[TMP9]]
+; CHECK-NEXT: [[TMP11]] = fptrunc <2 x double> [[TMP10]] to <2 x float>
+; CHECK-NEXT: [[TMP317:%.*]] = fptrunc double undef to float
+; CHECK-NEXT: [[TMP319:%.*]] = fptrunc double undef to float
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[TMP317]], i32 0
+; CHECK-NEXT: [[TMP13]] = insertelement <2 x float> [[TMP12]], float [[TMP319]], i32 1
+; CHECK-NEXT: br label [[BB283]]
+;
bb279:
br label %bb283
@@ -62,6 +93,12 @@ exit:
; vectorizer starts at the type (%t2, %t3) and will constant fold the tree.
; The code that handles insertelement instructions must handle this.
define <4 x double> @constant_folding() {
+; CHECK-LABEL: @constant_folding(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x double> undef, double 1.000000e+00, i32 1
+; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x double> [[I1]], double 2.000000e+00, i32 0
+; CHECK-NEXT: ret <4 x double> [[I2]]
+;
entry:
%t0 = fadd double 1.000000e+00 , 0.000000e+00
%t1 = fadd double 1.000000e+00 , 1.000000e+00
@@ -71,10 +108,3 @@ entry:
%i2 = insertelement <4 x double> %i1, double %t3, i32 0
ret <4 x double> %i2
}
-
-; CHECK-LABEL: @constant_folding
-; CHECK: %[[V0:.+]] = extractelement <2 x double> <double 1.000000e+00, double 2.000000e+00>, i32 0
-; CHECK: %[[V1:.+]] = insertelement <4 x double> undef, double %[[V0]], i32 1
-; CHECK: %[[V2:.+]] = extractelement <2 x double> <double 1.000000e+00, double 2.000000e+00>, i32 1
-; CHECK: %[[V3:.+]] = insertelement <4 x double> %[[V1]], double %[[V2]], i32 0
-; CHECK: ret <4 x double> %[[V3]]
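
The PR19621 note above is the whole point of this test: after folding, an element of the vectorized tree can be a plain Constant rather than an Instruction, and the insertelement handling has to tolerate that. A minimal standalone sketch of the defensive pattern, using toy types rather than the SLPVectorizer's actual classes:

#include <iostream>
#include <vector>

// Toy IR: a Value is either a folded Constant or a real Instruction.
struct Value { virtual ~Value() = default; };
struct Constant : Value { double Val; explicit Constant(double V) : Val(V) {} };
struct Instruction : Value { const char *Op; explicit Instruction(const char *O) : Op(O) {} };

// A tree walk that schedules real instructions and skips folded
// constants instead of assuming every element is an Instruction.
static void scheduleTree(const std::vector<Value *> &Tree) {
  for (Value *V : Tree) {
    if (auto *I = dynamic_cast<Instruction *>(V))
      std::cout << "schedule " << I->Op << "\n";
    else
      std::cout << "skip folded constant\n"; // e.g. fadd 1.0, 0.0 -> 1.0
  }
}

int main() {
  Constant C(1.0);                // %t0 folded to 1.0
  Instruction I("insertelement"); // %i1 survives as an instruction
  scheduleTree({&C, &I});
}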
diff --git a/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug-xfail.ll b/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug-xfail.ll
new file mode 100644
index 000000000000..e9c54151cf29
--- /dev/null
+++ b/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug-xfail.ll
@@ -0,0 +1,77 @@
+; XFAIL: *
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg -verify-region-info %s
+
+; FIXME: Merge into backedge-id-bug
+; Variant which has an issue with region construction
+
+define amdgpu_kernel void @loop_backedge_misidentified_alt(i32 addrspace(1)* %arg0) #0 {
+entry:
+ %tmp = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
+ %load1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i32 %tid
+ %i.initial = load volatile i32, i32 addrspace(1)* %gep, align 4
+ br label %LOOP.HEADER
+
+LOOP.HEADER:
+ %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ]
+ call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0
+ %tmp12 = zext i32 %i to i64
+ %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 %tmp12
+ %tmp14 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp13, align 16
+ %tmp15 = extractelement <4 x i32> %tmp14, i64 0
+ %tmp16 = and i32 %tmp15, 65535
+ %tmp17 = icmp eq i32 %tmp16, 1
+ br i1 %tmp17, label %bb18, label %bb62
+
+bb18:
+ %tmp19 = extractelement <2 x i32> %tmp, i64 0
+ %tmp22 = lshr i32 %tmp19, 16
+ %tmp24 = urem i32 %tmp22, 52
+ %tmp25 = mul nuw nsw i32 %tmp24, 52
+ br label %INNER_LOOP
+
+INNER_LOOP:
+ %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ]
+ call void asm sideeffect "; inner loop body", ""() #0
+ %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1
+ %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0
+ br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP
+
+INNER_LOOP_BREAK:
+ %tmp59 = extractelement <4 x i32> %tmp14, i64 2
+ call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
+ br label %END_ELSE_BLOCK
+
+bb62:
+ %load13 = icmp ult i32 %tmp16, 271
+ ;br i1 %load13, label %bb64, label %INCREMENT_I
+ ; branching directly to the return avoids the bug
+ br i1 %load13, label %RETURN, label %INCREMENT_I
+
+
+bb64:
+ call void asm sideeffect "s_nop 42", "~{memory}"() #0
+ br label %RETURN
+
+INCREMENT_I:
+ %inc.i = add i32 %i, 1
+ call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0
+ br label %END_ELSE_BLOCK
+
+END_ELSE_BLOCK:
+ %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ]
+ call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0
+ %cmp.end.else.block = icmp eq i32 %i.final, -1
+ br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER
+
+RETURN:
+ call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0
+ store volatile <2 x float> %load1, <2 x float> addrspace(1)* undef, align 8
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { convergent nounwind readnone }
diff --git a/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll b/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll
new file mode 100644
index 000000000000..9cddffdd1795
--- /dev/null
+++ b/test/Transforms/StructurizeCFG/AMDGPU/backedge-id-bug.ll
@@ -0,0 +1,163 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg %s | FileCheck %s
+
+; StructurizeCFG::orderNodes used an arbitrary and nonsensical sorting
+; function which broke the basic backedge identification algorithm. It
+; would use RPO order, but then do a weird partial sort by loop depth,
+; assuming blocks are sorted by loop. However, a block that is not part
+; of a loop can appear in between blocks of a loop, breaking the
+; assumption of the sort.
+;
+; collectInfos must be done in RPO order. The actual structurization
+; order is, I think, less important, but unless the loop headers are
+; identified in RPO order, it finds the wrong set of back edges.
+
+define amdgpu_kernel void @loop_backedge_misidentified(i32 addrspace(1)* %arg0) #0 {
+; CHECK-LABEL: @loop_backedge_misidentified(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP:%.*]] = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
+; CHECK-NEXT: [[LOAD1:%.*]] = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG0:%.*]], i32 [[TID]]
+; CHECK-NEXT: [[I_INITIAL:%.*]] = load volatile i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
+; CHECK: LOOP.HEADER:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP10:%.*]], [[FLOW4:%.*]] ]
+; CHECK-NEXT: call void asm sideeffect "s_nop 0x100b
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[I]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP13]], align 16
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP14]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[TMP17]], true
+; CHECK-NEXT: br i1 [[TMP0]], label [[BB62:%.*]], label [[FLOW:%.*]]
+; CHECK: Flow2:
+; CHECK-NEXT: br label [[FLOW]]
+; CHECK: bb18:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = lshr i32 [[TMP19]], 16
+; CHECK-NEXT: [[TMP24:%.*]] = urem i32 [[TMP22]], 52
+; CHECK-NEXT: [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52
+; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
+; CHECK: Flow3:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP7:%.*]], [[FLOW]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP8:%.*]], [[FLOW]] ]
+; CHECK-NEXT: br i1 [[TMP2]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW4]]
+; CHECK: INNER_LOOP:
+; CHECK-NEXT: [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ]
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: [[INNER_LOOP_J_INC]] = add nsw i32 [[INNER_LOOP_J]], 1
+; CHECK-NEXT: [[INNER_LOOP_CMP:%.*]] = icmp eq i32 [[INNER_LOOP_J]], 0
+; CHECK-NEXT: br i1 [[INNER_LOOP_CMP]], label [[INNER_LOOP_BREAK]], label [[INNER_LOOP]]
+; CHECK: INNER_LOOP_BREAK:
+; CHECK-NEXT: [[TMP59]] = extractelement <4 x i32> [[TMP14]], i64 2
+; CHECK-NEXT: call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
+; CHECK-NEXT: br label [[FLOW3:%.*]]
+; CHECK: bb62:
+; CHECK-NEXT: [[LOAD13:%.*]] = icmp ult i32 [[TMP16]], 271
+; CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[LOAD13]], true
+; CHECK-NEXT: br i1 [[TMP3]], label [[INCREMENT_I:%.*]], label [[FLOW1:%.*]]
+; CHECK: Flow1:
+; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[INC_I:%.*]], [[INCREMENT_I]] ], [ undef, [[BB62]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ true, [[INCREMENT_I]] ], [ false, [[BB62]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ false, [[INCREMENT_I]] ], [ true, [[BB62]] ]
+; CHECK-NEXT: br i1 [[TMP6]], label [[BB64:%.*]], label [[FLOW2:%.*]]
+; CHECK: bb64:
+; CHECK-NEXT: call void asm sideeffect "s_nop 42", "~{memory}"() #0
+; CHECK-NEXT: br label [[FLOW2]]
+; CHECK: Flow:
+; CHECK-NEXT: [[TMP7]] = phi i32 [ [[TMP4]], [[FLOW2]] ], [ undef, [[LOOP_HEADER]] ]
+; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP5]], [[FLOW2]] ], [ false, [[LOOP_HEADER]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ false, [[FLOW2]] ], [ true, [[LOOP_HEADER]] ]
+; CHECK-NEXT: br i1 [[TMP9]], label [[BB18]], label [[FLOW3]]
+; CHECK: INCREMENT_I:
+; CHECK-NEXT: [[INC_I]] = add i32 [[I]], 1
+; CHECK-NEXT: call void asm sideeffect "s_nop 0x1336
+; CHECK-NEXT: br label [[FLOW1]]
+; CHECK: END_ELSE_BLOCK:
+; CHECK-NEXT: [[I_FINAL:%.*]] = phi i32 [ [[TMP1]], [[FLOW3]] ]
+; CHECK-NEXT: call void asm sideeffect "s_nop 0x1337
+; CHECK-NEXT: [[CMP_END_ELSE_BLOCK:%.*]] = icmp eq i32 [[I_FINAL]], -1
+; CHECK-NEXT: br label [[FLOW4]]
+; CHECK: Flow4:
+; CHECK-NEXT: [[TMP10]] = phi i32 [ [[I_FINAL]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW3]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW3]] ]
+; CHECK-NEXT: br i1 [[TMP11]], label [[RETURN:%.*]], label [[LOOP_HEADER]]
+; CHECK: RETURN:
+; CHECK-NEXT: call void asm sideeffect "s_nop 0x99
+; CHECK-NEXT: store volatile <2 x float> [[LOAD1]], <2 x float> addrspace(1)* undef, align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %tmp = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
+ %load1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i32 %tid
+ %i.initial = load volatile i32, i32 addrspace(1)* %gep, align 4
+ br label %LOOP.HEADER
+
+LOOP.HEADER:
+ %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ]
+ call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0
+ %tmp12 = zext i32 %i to i64
+ %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 %tmp12
+ %tmp14 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp13, align 16
+ %tmp15 = extractelement <4 x i32> %tmp14, i64 0
+ %tmp16 = and i32 %tmp15, 65535
+ %tmp17 = icmp eq i32 %tmp16, 1
+ br i1 %tmp17, label %bb18, label %bb62
+
+bb18:
+ %tmp19 = extractelement <2 x i32> %tmp, i64 0
+ %tmp22 = lshr i32 %tmp19, 16
+ %tmp24 = urem i32 %tmp22, 52
+ %tmp25 = mul nuw nsw i32 %tmp24, 52
+ br label %INNER_LOOP
+
+INNER_LOOP:
+ %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ]
+ call void asm sideeffect "; inner loop body", ""() #0
+ %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1
+ %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0
+ br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP
+
+INNER_LOOP_BREAK:
+ %tmp59 = extractelement <4 x i32> %tmp14, i64 2
+ call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
+ br label %END_ELSE_BLOCK
+
+bb62:
+ %load13 = icmp ult i32 %tmp16, 271
+ br i1 %load13, label %bb64, label %INCREMENT_I
+
+bb64:
+ call void asm sideeffect "s_nop 42", "~{memory}"() #0
+ br label %RETURN
+
+INCREMENT_I:
+ %inc.i = add i32 %i, 1
+ call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0
+ br label %END_ELSE_BLOCK
+
+END_ELSE_BLOCK:
+ %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ]
+ call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0
+ %cmp.end.else.block = icmp eq i32 %i.final, -1
+ br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER
+
+RETURN:
+ call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0
+ store volatile <2 x float> %load1, <2 x float> addrspace(1)* undef, align 8
+ ret void
+}
+
+; The same function, except the break to the return block goes directly
+; to the return, which managed to hide the bug.
+; FIXME: Merge variant from backedge-id-bug-xfail
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { convergent nounwind readnone }
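
A minimal standalone illustration of the invariant the header comment describes, assuming a reducible CFG and a toy adjacency list (this is not StructurizeCFG's code): with a reverse post-order (RPO) numbering, an edge u -> v is a back edge exactly when rpo[v] <= rpo[u], so any block order other than RPO misclassifies edges.

#include <functional>
#include <iostream>
#include <vector>

int main() {
  // Small loop nest: 0 -> 1 -> 2 -> {1, 3}; the back edge is 2 -> 1.
  std::vector<std::vector<int>> Succ = {{1}, {2}, {1, 3}, {}};
  std::vector<int> RPO(Succ.size(), -1), PostOrder;
  std::vector<bool> Seen(Succ.size(), false);

  std::function<void(int)> DFS = [&](int N) {
    Seen[N] = true;
    for (int S : Succ[N])
      if (!Seen[S])
        DFS(S);
    PostOrder.push_back(N); // reverse of this is the RPO
  };
  DFS(0);
  for (unsigned I = 0; I < PostOrder.size(); ++I)
    RPO[PostOrder[PostOrder.size() - 1 - I]] = I;

  for (unsigned U = 0; U < Succ.size(); ++U)
    for (int V : Succ[U])
      if (RPO[V] <= RPO[U])
        std::cout << U << " -> " << V << " is a back edge\n"; // prints 2 -> 1
}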
diff --git a/test/Transforms/StructurizeCFG/AMDGPU/lit.local.cfg b/test/Transforms/StructurizeCFG/AMDGPU/lit.local.cfg
new file mode 100644
index 000000000000..2a665f06be72
--- /dev/null
+++ b/test/Transforms/StructurizeCFG/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AMDGPU' in config.root.targets:
+ config.unsupported = True
diff --git a/test/Transforms/StructurizeCFG/nested-loop-order.ll b/test/Transforms/StructurizeCFG/nested-loop-order.ll
index 58634d0d37db..7b5bd5acb629 100644
--- a/test/Transforms/StructurizeCFG/nested-loop-order.ll
+++ b/test/Transforms/StructurizeCFG/nested-loop-order.ll
@@ -1,32 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -structurizecfg %s -o - | FileCheck %s
define void @main(float addrspace(1)* %out) {
-
-; CHECK: main_body:
-; CHECK: br label %LOOP.outer
+; CHECK-LABEL: @main(
+; CHECK-NEXT: main_body:
+; CHECK-NEXT: br label [[LOOP_OUTER:%.*]]
+; CHECK: LOOP.outer:
+; CHECK-NEXT: [[TEMP8_0_PH:%.*]] = phi float [ 0.000000e+00, [[MAIN_BODY:%.*]] ], [ [[TMP13:%.*]], [[FLOW3:%.*]] ]
+; CHECK-NEXT: [[TEMP4_0_PH:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP12:%.*]], [[FLOW3]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: LOOP:
+; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP12]], [[FLOW:%.*]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi float [ undef, [[LOOP_OUTER]] ], [ [[TMP13]], [[FLOW]] ]
+; CHECK-NEXT: [[TEMP4_0:%.*]] = phi i32 [ [[TEMP4_0_PH]], [[LOOP_OUTER]] ], [ [[TMP15:%.*]], [[FLOW]] ]
+; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TEMP4_0]], 1
+; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[TMP22]], true
+; CHECK-NEXT: br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]]
+; CHECK: Flow2:
+; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TEMP8_0_PH]], [[IF29:%.*]] ], [ [[TMP9:%.*]], [[FLOW1:%.*]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP20]], [[IF29]] ], [ undef, [[FLOW1]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP32:%.*]], [[IF29]] ], [ true, [[FLOW1]] ]
+; CHECK-NEXT: br label [[FLOW]]
+; CHECK: Flow3:
+; CHECK-NEXT: br i1 [[TMP16:%.*]], label [[ENDLOOP:%.*]], label [[LOOP_OUTER]]
+; CHECK: ENDLOOP:
+; CHECK-NEXT: [[TEMP8_1:%.*]] = phi float [ [[TMP14:%.*]], [[FLOW3]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP20]], 3
+; CHECK-NEXT: [[DOT45:%.*]] = select i1 [[TMP23]], float 0.000000e+00, float 1.000000e+00
+; CHECK-NEXT: store float [[DOT45]], float addrspace(1)* [[OUT:%.*]]
+; CHECK-NEXT: ret void
+; CHECK: ENDIF:
+; CHECK-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP20]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = xor i1 [[TMP31]], true
+; CHECK-NEXT: br i1 [[TMP6]], label [[ENDIF28:%.*]], label [[FLOW1]]
+; CHECK: Flow1:
+; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP20]], [[ENDIF28]] ], [ [[TMP0]], [[ENDIF]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = phi float [ [[TMP35:%.*]], [[ENDIF28]] ], [ [[TMP1]], [[ENDIF]] ]
+; CHECK-NEXT: [[TMP9]] = phi float [ [[TMP35]], [[ENDIF28]] ], [ [[TEMP8_0_PH]], [[ENDIF]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = phi i1 [ [[TMP36:%.*]], [[ENDIF28]] ], [ true, [[ENDIF]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = phi i1 [ false, [[ENDIF28]] ], [ true, [[ENDIF]] ]
+; CHECK-NEXT: br i1 [[TMP11]], label [[IF29]], label [[FLOW2:%.*]]
+; CHECK: IF29:
+; CHECK-NEXT: [[TMP32]] = icmp sgt i32 [[TMP20]], 2
+; CHECK-NEXT: br label [[FLOW2]]
+; CHECK: Flow:
+; CHECK-NEXT: [[TMP12]] = phi i32 [ [[TMP7]], [[FLOW2]] ], [ [[TMP0]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP13]] = phi float [ [[TMP8]], [[FLOW2]] ], [ [[TMP1]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP14]] = phi float [ [[TMP3]], [[FLOW2]] ], [ [[TEMP8_0_PH]], [[LOOP]] ]
+; CHECK-NEXT: [[TMP15]] = phi i32 [ [[TMP4]], [[FLOW2]] ], [ undef, [[LOOP]] ]
+; CHECK-NEXT: [[TMP16]] = phi i1 [ [[TMP10]], [[FLOW2]] ], [ true, [[LOOP]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = phi i1 [ [[TMP5]], [[FLOW2]] ], [ true, [[LOOP]] ]
+; CHECK-NEXT: br i1 [[TMP17]], label [[FLOW3]], label [[LOOP]]
+; CHECK: ENDIF28:
+; CHECK-NEXT: [[TMP35]] = fadd float [[TEMP8_0_PH]], 1.000000e+00
+; CHECK-NEXT: [[TMP36]] = icmp sgt i32 [[TMP20]], 2
+; CHECK-NEXT: br label [[FLOW1]]
+;
main_body:
br label %LOOP.outer
-; CHECK: LOOP.outer:
-; CHECK: br label %LOOP
LOOP.outer: ; preds = %ENDIF28, %main_body
%temp8.0.ph = phi float [ 0.000000e+00, %main_body ], [ %tmp35, %ENDIF28 ]
%temp4.0.ph = phi i32 [ 0, %main_body ], [ %tmp20, %ENDIF28 ]
br label %LOOP
-; CHECK: LOOP:
-; br i1 %{{[0-9]+}}, label %ENDIF, label %Flow
LOOP: ; preds = %IF29, %LOOP.outer
%temp4.0 = phi i32 [ %temp4.0.ph, %LOOP.outer ], [ %tmp20, %IF29 ]
%tmp20 = add i32 %temp4.0, 1
%tmp22 = icmp sgt i32 %tmp20, 3
br i1 %tmp22, label %ENDLOOP, label %ENDIF
-; CHECK: Flow3
-; CHECK: br i1 %{{[0-9]+}}, label %ENDLOOP, label %LOOP.outer
-
-; CHECK: ENDLOOP:
-; CHECK: ret void
ENDLOOP: ; preds = %ENDIF28, %IF29, %LOOP
%temp8.1 = phi float [ %temp8.0.ph, %LOOP ], [ %temp8.0.ph, %IF29 ], [ %tmp35, %ENDIF28 ]
%tmp23 = icmp eq i32 %tmp20, 3
@@ -34,29 +78,14 @@ ENDLOOP: ; preds = %ENDIF28, %IF29, %LO
store float %.45, float addrspace(1)* %out
ret void
-; CHECK: ENDIF:
-; CHECK: br i1 %tmp31, label %IF29, label %Flow1
ENDIF: ; preds = %LOOP
%tmp31 = icmp sgt i32 %tmp20, 1
br i1 %tmp31, label %IF29, label %ENDIF28
-; CHECK: Flow:
-; CHECK: br i1 %{{[0-9]+}}, label %Flow2, label %LOOP
-
-; CHECK: IF29:
-; CHECK: br label %Flow1
IF29: ; preds = %ENDIF
%tmp32 = icmp sgt i32 %tmp20, 2
br i1 %tmp32, label %ENDLOOP, label %LOOP
-; CHECK: Flow1:
-; CHECK: br label %Flow
-
-; CHECK: Flow2:
-; CHECK: br i1 %{{[0-9]+}}, label %ENDIF28, label %Flow3
-
-; CHECK: ENDIF28:
-; CHECK: br label %Flow3
ENDIF28: ; preds = %ENDIF
%tmp35 = fadd float %temp8.0.ph, 1.0
%tmp36 = icmp sgt i32 %tmp20, 2
diff --git a/test/tools/llvm-readobj/macho-needed-libs.test b/test/tools/llvm-readobj/macho-needed-libs.test
new file mode 100644
index 000000000000..22e6948e758f
--- /dev/null
+++ b/test/tools/llvm-readobj/macho-needed-libs.test
@@ -0,0 +1,26 @@
+# RUN: yaml2obj %s -o %t.o
+# RUN: llvm-readobj -needed-libs %t.o | FileCheck %s
+
+# CHECK: NeededLibraries [
+# CHECK-NEXT: /usr/lib/libSystem.B.dylib
+# CHECK-NEXT: ]
+
+!mach-o
+FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x01000007
+ cpusubtype: 0x00000003
+ filetype: 0x00000001
+ ncmds: 1
+ sizeofcmds: 56
+ flags: 0x00002000
+ reserved: 0x00000000
+LoadCommands:
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 81985536
+ compatibility_version: 65536
+ PayloadString: /usr/lib/libSystem.B.dylib
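
For orientation on the name: 24 field above: in the standard Mach-O dylib_command layout (from <mach-o/loader.h>), dylib.name is an offset from the start of the load command, and the fixed header is exactly 24 bytes, so the path string begins right after it; cmdsize (56 here) covers the header plus the padded string. This is also why the dumper change below checks Dl.dylib.name < Dl.cmdsize before reading the name. A layout sketch with abbreviated field names:

#include <cstdint>
#include <cstdio>

// Mirrors the fixed 24-byte prefix of Mach-O's dylib_command; the
// library path string follows immediately at name_offset == 24.
struct DylibCommandLayout {
  uint32_t cmd;                   // e.g. LC_LOAD_DYLIB
  uint32_t cmdsize;               // header + padded path string (56 above)
  uint32_t name_offset;           // lc_str offset of the path, here 24
  uint32_t timestamp;
  uint32_t current_version;
  uint32_t compatibility_version;
};

int main() {
  static_assert(sizeof(DylibCommandLayout) == 24, "fixed header is 24 bytes");
  std::printf("path begins at offset %zu\n", sizeof(DylibCommandLayout));
}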
diff --git a/tools/llvm-readobj/MachODumper.cpp b/tools/llvm-readobj/MachODumper.cpp
index 39e909279937..64178d7b33ad 100644
--- a/tools/llvm-readobj/MachODumper.cpp
+++ b/tools/llvm-readobj/MachODumper.cpp
@@ -39,6 +39,8 @@ public:
void printUnwindInfo() override;
void printStackMap() const override;
+ void printNeededLibraries() override;
+
// MachO-specific.
void printMachODataInCode() override;
void printMachOVersionMin() override;
@@ -675,6 +677,34 @@ void MachODumper::printStackMap() const {
StackMapV2Parser<support::big>(StackMapContentsArray));
}
+void MachODumper::printNeededLibraries() {
+ ListScope D(W, "NeededLibraries");
+
+ using LibsTy = std::vector<StringRef>;
+ LibsTy Libs;
+
+ for (const auto &Command : Obj->load_commands()) {
+ if (Command.C.cmd == MachO::LC_LOAD_DYLIB ||
+ Command.C.cmd == MachO::LC_ID_DYLIB ||
+ Command.C.cmd == MachO::LC_LOAD_WEAK_DYLIB ||
+ Command.C.cmd == MachO::LC_REEXPORT_DYLIB ||
+ Command.C.cmd == MachO::LC_LAZY_LOAD_DYLIB ||
+ Command.C.cmd == MachO::LC_LOAD_UPWARD_DYLIB) {
+ MachO::dylib_command Dl = Obj->getDylibIDLoadCommand(Command);
+ if (Dl.dylib.name < Dl.cmdsize) {
+ auto *P = static_cast<const char*>(Command.Ptr) + Dl.dylib.name;
+ Libs.push_back(P);
+ }
+ }
+ }
+
+ std::stable_sort(Libs.begin(), Libs.end());
+
+ for (const auto &L : Libs) {
+ outs() << " " << L << "\n";
+ }
+}
+
void MachODumper::printMachODataInCode() {
for (const auto &Load : Obj->load_commands()) {
if (Load.C.cmd == MachO::LC_DATA_IN_CODE) {
diff --git a/unittests/IR/DominatorTreeBatchUpdatesTest.cpp b/unittests/IR/DominatorTreeBatchUpdatesTest.cpp
index 4ad1f69030c1..e362afd84048 100644
--- a/unittests/IR/DominatorTreeBatchUpdatesTest.cpp
+++ b/unittests/IR/DominatorTreeBatchUpdatesTest.cpp
@@ -258,3 +258,98 @@ TEST(DominatorTreeBatchUpdates, InsertDeleteExhaustive) {
EXPECT_TRUE(PDT.verify());
}
}
+
+// These are some odd flowgraphs, usually generated from csmith cases,
+// which are difficult for post dominator trees.
+TEST(DominatorTreeBatchUpdates, InfiniteLoop) {
+ std::vector<CFGBuilder::Arc> Arcs = {
+ {"1", "2"},
+ {"2", "3"},
+ {"3", "6"}, {"3", "5"},
+ {"4", "5"},
+ {"5", "2"},
+ {"6", "3"}, {"6", "4"}};
+
+ // SplitBlock on 3 -> 5
+ std::vector<CFGBuilder::Update> Updates = {
+ {CFGInsert, {"N", "5"}}, {CFGInsert, {"3", "N"}}, {CFGDelete, {"3", "5"}}};
+
+ CFGHolder Holder;
+ CFGBuilder B(Holder.F, Arcs, Updates);
+ DominatorTree DT(*Holder.F);
+ EXPECT_TRUE(DT.verify());
+ PostDomTree PDT(*Holder.F);
+ EXPECT_TRUE(PDT.verify());
+
+ while (B.applyUpdate())
+ ;
+
+ auto DomUpdates = ToDomUpdates(B, Updates);
+ DT.applyUpdates(DomUpdates);
+ EXPECT_TRUE(DT.verify());
+ PDT.applyUpdates(DomUpdates);
+ EXPECT_TRUE(PDT.verify());
+}
+
+TEST(DominatorTreeBatchUpdates, DeadBlocks) {
+ std::vector<CFGBuilder::Arc> Arcs = {
+ {"1", "2"},
+ {"2", "3"},
+ {"3", "4"}, {"3", "7"},
+ {"4", "4"},
+ {"5", "6"}, {"5", "7"},
+ {"6", "7"},
+ {"7", "2"}, {"7", "8"}};
+
+  // Remove dead 5 and 6,
+ // plus SplitBlock on 7 -> 8
+ std::vector<CFGBuilder::Update> Updates = {
+ {CFGDelete, {"6", "7"}}, {CFGDelete, {"5", "7"}}, {CFGDelete, {"5", "6"}},
+ {CFGInsert, {"N", "8"}}, {CFGInsert, {"7", "N"}}, {CFGDelete, {"7", "8"}}};
+
+ CFGHolder Holder;
+ CFGBuilder B(Holder.F, Arcs, Updates);
+ DominatorTree DT(*Holder.F);
+ EXPECT_TRUE(DT.verify());
+ PostDomTree PDT(*Holder.F);
+ EXPECT_TRUE(PDT.verify());
+
+ while (B.applyUpdate())
+ ;
+
+ auto DomUpdates = ToDomUpdates(B, Updates);
+ DT.applyUpdates(DomUpdates);
+ EXPECT_TRUE(DT.verify());
+ PDT.applyUpdates(DomUpdates);
+ EXPECT_TRUE(PDT.verify());
+}
+
+TEST(DominatorTreeBatchUpdates, InfiniteLoop2) {
+ std::vector<CFGBuilder::Arc> Arcs = {
+ {"1", "2"},
+ {"2", "6"}, {"2", "3"},
+ {"3", "4"},
+ {"4", "5"}, {"4", "6"},
+ {"5", "4"},
+ {"6", "2"}};
+
+ // SplitBlock on 4 -> 6
+ std::vector<CFGBuilder::Update> Updates = {
+ {CFGInsert, {"N", "6"}}, {CFGInsert, {"4", "N"}}, {CFGDelete, {"4", "6"}}};
+
+ CFGHolder Holder;
+ CFGBuilder B(Holder.F, Arcs, Updates);
+ DominatorTree DT(*Holder.F);
+ EXPECT_TRUE(DT.verify());
+ PostDomTree PDT(*Holder.F);
+ EXPECT_TRUE(PDT.verify());
+
+ while (B.applyUpdate())
+ ;
+
+ auto DomUpdates = ToDomUpdates(B, Updates);
+ DT.applyUpdates(DomUpdates);
+ EXPECT_TRUE(DT.verify());
+ PDT.applyUpdates(DomUpdates);
+ EXPECT_TRUE(PDT.verify());
+}
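
All three tests encode SplitBlock(A -> B) as the same insert/insert/delete triple. A hypothetical helper spelling out the pattern (the Update shape here merely mirrors the test fixture's, by assumption; it is not CFGBuilder's API):

#include <iostream>
#include <string>
#include <vector>

enum UpdateKind { CFGInsert, CFGDelete };
struct Update { UpdateKind Kind; std::string From, To; };

// SplitBlock on A -> B with new block N: reroute through N, then drop
// the original edge.
static std::vector<Update> splitBlockUpdates(const std::string &A,
                                             const std::string &B,
                                             const std::string &N) {
  return {{CFGInsert, N, B},   // N falls through to B
          {CFGInsert, A, N},   // A now branches to N
          {CFGDelete, A, B}};  // the direct A -> B edge disappears
}

int main() {
  for (const Update &U : splitBlockUpdates("3", "5", "N"))
    std::cout << (U.Kind == CFGInsert ? "insert " : "delete ")
              << U.From << " -> " << U.To << "\n";
}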
diff --git a/unittests/IR/DominatorTreeTest.cpp b/unittests/IR/DominatorTreeTest.cpp
index bf5aced49289..4666f93da2d9 100644
--- a/unittests/IR/DominatorTreeTest.cpp
+++ b/unittests/IR/DominatorTreeTest.cpp
@@ -925,3 +925,28 @@ TEST(DominatorTree, InsertDeleteExhaustive) {
}
}
}
+
+TEST(DominatorTree, InsertIntoIrreducible) {
+ std::vector<CFGBuilder::Arc> Arcs = {
+ {"0", "1"},
+ {"1", "27"}, {"1", "7"},
+ {"10", "18"},
+ {"13", "10"},
+ {"18", "13"}, {"18", "23"},
+ {"23", "13"}, {"23", "24"},
+ {"24", "1"}, {"24", "18"},
+ {"27", "24"}};
+
+ CFGHolder Holder;
+ CFGBuilder B(Holder.F, Arcs, {{Insert, {"7", "23"}}});
+ DominatorTree DT(*Holder.F);
+ EXPECT_TRUE(DT.verify());
+
+ B.applyUpdate();
+ BasicBlock *From = B.getOrAddBlock("7");
+ BasicBlock *To = B.getOrAddBlock("23");
+ DT.insertEdge(From, To);
+
+ EXPECT_TRUE(DT.verify());
+}
+
diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh
index 66a2c578083e..440dee53c1b7 100755
--- a/utils/release/test-release.sh
+++ b/utils/release/test-release.sh
@@ -33,6 +33,7 @@ do_asserts="no"
do_compare="yes"
do_rt="yes"
do_libs="yes"
+do_libcxxabi="yes"
do_libunwind="yes"
do_test_suite="yes"
do_openmp="yes"
@@ -62,6 +63,7 @@ function usage() {
echo " For example -svn-path trunk or -svn-path branches/release_37"
echo " -no-rt Disable check-out & build Compiler-RT"
echo " -no-libs Disable check-out & build libcxx/libcxxabi/libunwind"
+ echo " -no-libcxxabi Disable check-out & build libcxxabi"
echo " -no-libunwind Disable check-out & build libunwind"
echo " -no-test-suite Disable check-out & build test-suite"
echo " -no-openmp Disable check-out & build libomp"
@@ -135,6 +137,9 @@ while [ $# -gt 0 ]; do
-no-libs )
do_libs="no"
;;
+ -no-libcxxabi )
+ do_libcxxabi="no"
+ ;;
-no-libunwind )
do_libunwind="no"
;;
@@ -206,7 +211,10 @@ if [ $do_rt = "yes" ]; then
projects="$projects compiler-rt"
fi
if [ $do_libs = "yes" ]; then
- projects="$projects libcxx libcxxabi"
+ projects="$projects libcxx"
+ if [ $do_libcxxabi = "yes" ]; then
+ projects="$projects libcxxabi"
+ fi
if [ $do_libunwind = "yes" ]; then
projects="$projects libunwind"
fi
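
A sample invocation of the new flag (the -release, -rc, and -triple options already exist in this script; the values below are placeholders): build a candidate with libcxx and libunwind but without a separate libcxxabi checkout, e.g. on a platform that ships its own C++ ABI library. Since the libcxxabi checkout is nested under do_libs, the flag is a no-op when -no-libs is also given.

# Placeholder release/rc/triple values; skips only the libcxxabi project.
./test-release.sh -release 6.0.0 -rc 2 -triple x86_64-linux-gnu -no-libcxxabi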