aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2017-01-04 22:11:11 +0000
committerDimitry Andric <dim@FreeBSD.org>2017-01-04 22:11:11 +0000
commitc82ad72f63369bc462e59458f09960d66daa58a9 (patch)
tree58bc455a8d052220f9ae11e65d6f06d671a7a4c4
parentb915e9e0fc85ba6f398b3fab0db6a81a8913af94 (diff)
downloadsrc-vendor/llvm/llvm-trunk-r291012.tar.gz
src-vendor/llvm/llvm-trunk-r291012.zip
Vendor import of llvm trunk r291012:vendor/llvm/llvm-trunk-r291012
-rwxr-xr-xcmake/config-ix.cmake7
-rw-r--r--cmake/modules/CheckCompilerVersion.cmake4
-rw-r--r--include/llvm/ADT/IntrusiveRefCntPtr.h7
-rw-r--r--include/llvm/ADT/PriorityWorklist.h39
-rw-r--r--include/llvm/Analysis/Loads.h18
-rw-r--r--include/llvm/CodeGen/AsmPrinter.h5
-rw-r--r--include/llvm/CodeGen/MachineDominators.h4
-rw-r--r--include/llvm/DebugInfo/DWARF/DWARFDebugLine.h6
-rw-r--r--include/llvm/IR/IntrinsicsAMDGPU.td7
-rw-r--r--include/llvm/IR/IntrinsicsX86.td124
-rw-r--r--include/llvm/Support/FileSystem.h18
-rw-r--r--include/llvm/Support/YAMLTraits.h98
-rw-r--r--lib/Analysis/ValueTracking.cpp6
-rw-r--r--lib/Bitcode/Reader/MetadataLoader.cpp2
-rw-r--r--lib/CodeGen/Analysis.cpp26
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinter.cpp57
-rw-r--r--lib/CodeGen/InlineSpiller.cpp8
-rw-r--r--lib/CodeGen/SelectionDAG/DAGCombiner.cpp18
-rw-r--r--lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp16
-rw-r--r--lib/Fuzzer/FuzzerTracePC.cpp2
-rw-r--r--lib/IR/AutoUpgrade.cpp64
-rw-r--r--lib/LTO/LTO.cpp12
-rw-r--r--lib/Support/APFloat.cpp8
-rw-r--r--lib/Support/Host.cpp1
-rw-r--r--lib/Support/NativeFormatting.cpp5
-rw-r--r--lib/Support/YAMLTraits.cpp26
-rw-r--r--lib/TableGen/StringMatcher.cpp19
-rw-r--r--lib/Target/AArch64/AArch64.td6
-rw-r--r--lib/Target/AArch64/AArch64AsmPrinter.cpp56
-rw-r--r--lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp3
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp1
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h1
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.td4
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp9
-rw-r--r--lib/Target/AMDGPU/SIInsertWaits.cpp5
-rw-r--r--lib/Target/AMDGPU/SOPInstructions.td5
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.cpp3
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.h3
-rw-r--r--lib/Target/ARM/ARMMCInstLower.cpp38
-rw-r--r--lib/Target/Hexagon/BitTracker.cpp70
-rw-r--r--lib/Target/Hexagon/BitTracker.h53
-rw-r--r--lib/Target/Hexagon/HexagonBitTracker.cpp52
-rw-r--r--lib/Target/Hexagon/HexagonBitTracker.h22
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.cpp181
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.h19
-rw-r--r--lib/Target/Hexagon/HexagonMachineFunctionInfo.h27
-rw-r--r--lib/Target/Hexagon/HexagonTargetObjectFile.cpp39
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp45
-rw-r--r--lib/Target/Hexagon/RDFCopy.h19
-rw-r--r--lib/Target/Hexagon/RDFGraph.cpp60
-rw-r--r--lib/Target/Hexagon/RDFGraph.h99
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.cpp6
-rw-r--r--lib/Target/Mips/MipsSEISelDAGToDAG.h2
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp79
-rw-r--r--lib/Target/X86/X86AsmPrinter.cpp2
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp22
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp232
-rw-r--r--lib/Target/X86/X86InstrAVX512.td43
-rw-r--r--lib/Target/X86/X86InstrSSE.td53
-rwxr-xr-xlib/Target/X86/X86InstrTablesInfo.h90
-rw-r--r--lib/Target/X86/X86IntrinsicsInfo.h26
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp50
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp220
-rw-r--r--lib/Transforms/InstCombine/InstCombineAddSub.cpp12
-rw-r--r--lib/Transforms/InstCombine/InstCombineCalls.cpp82
-rw-r--r--lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp18
-rw-r--r--lib/Transforms/InstCombine/InstCombineShifts.cpp19
-rw-r--r--lib/Transforms/Scalar/EarlyCSE.cpp4
-rw-r--r--lib/Transforms/Scalar/NewGVN.cpp60
-rw-r--r--lib/Transforms/Utils/InlineFunction.cpp2
-rw-r--r--lib/Transforms/Utils/LoopUnrollPeel.cpp25
-rw-r--r--lib/Transforms/Utils/SimplifyCFG.cpp10
-rw-r--r--runtimes/CMakeLists.txt6
-rw-r--r--test/Analysis/CostModel/X86/alternate-shuffle-cost.ll44
-rw-r--r--test/Analysis/RegionInfo/bad_node_traversal.ll43
-rw-r--r--test/Bitcode/DIGlobalVariableExpression.ll3
-rw-r--r--test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll7
-rw-r--r--test/CodeGen/AArch64/store_merge_pair_offset.ll12
-rw-r--r--test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll41
-rw-r--r--test/CodeGen/AMDGPU/amdgcn.sendmsg.ll161
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll17
-rw-r--r--test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll24
-rw-r--r--test/CodeGen/PowerPC/ppc64-blnop.ll129
-rw-r--r--test/CodeGen/PowerPC/ppc64-sibcall.ll8
-rw-r--r--test/CodeGen/SPARC/soft-float.ll6
-rw-r--r--test/CodeGen/X86/MergeConsecutiveStores.ll34
-rw-r--r--test/CodeGen/X86/avx2-vbroadcast.ll233
-rw-r--r--test/CodeGen/X86/avx512-any_extend_load.ll6
-rw-r--r--test/CodeGen/X86/avx512-extract-subvector.ll12
-rw-r--r--test/CodeGen/X86/avx512-insert-extract.ll32
-rw-r--r--test/CodeGen/X86/avx512-intrinsics-upgrade.ll184
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll127
-rw-r--r--test/CodeGen/X86/avx512-skx-insert-subvec.ll6
-rw-r--r--test/CodeGen/X86/avx512-vbroadcasti128.ll6
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics.ll8
-rw-r--r--test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll136
-rw-r--r--test/CodeGen/X86/avx512dq-intrinsics.ll121
-rw-r--r--test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll59
-rw-r--r--test/CodeGen/X86/avx512dqvl-intrinsics.ll60
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll60
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics.ll61
-rw-r--r--test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll72
-rw-r--r--test/CodeGen/X86/frame-lowering-debug-intrinsic.ll41
-rw-r--r--test/CodeGen/X86/i64-to-float.ll20
-rw-r--r--test/CodeGen/X86/masked_memop.ll56
-rw-r--r--test/CodeGen/X86/stack-folding-fp-avx512vl.ll8
-rw-r--r--test/CodeGen/X86/stack-folding-int-avx512vl.ll8
-rw-r--r--test/CodeGen/X86/subvector-broadcast.ll263
-rw-r--r--test/CodeGen/X86/vec_fp_to_int.ll158
-rw-r--r--test/CodeGen/X86/vec_int_to_fp.ll28
-rw-r--r--test/CodeGen/X86/vector-half-conversions.ll64
-rw-r--r--test/CodeGen/X86/vector-lzcnt-256.ll86
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v16.ll834
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v32.ll90
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v4.ll68
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v8.ll530
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v16.ll72
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v8.ll196
-rw-r--r--test/CodeGen/X86/vector-trunc-math.ll1463
-rw-r--r--test/CodeGen/X86/vector-trunc.ll6
-rw-r--r--test/DebugInfo/Generic/simplifycfg_sink_last_inst.ll70
-rw-r--r--test/DebugInfo/X86/dbg-value-frame-index.ll39
-rw-r--r--test/MC/ARM/coff-relocations.s2
-rw-r--r--test/ThinLTO/X86/drop-debug-info.ll4
-rw-r--r--test/Transforms/Inline/inline-invoke-tail.ll2
-rw-r--r--test/Transforms/InstCombine/add.ll12
-rw-r--r--test/Transforms/InstCombine/assume.ll63
-rw-r--r--test/Transforms/InstCombine/fabs.ll48
-rw-r--r--test/Transforms/InstCombine/fma.ll203
-rw-r--r--test/Transforms/InstCombine/rem.ll10
-rw-r--r--test/Transforms/InstCombine/shift.ll12
-rw-r--r--test/Transforms/InstCombine/sink-zext.ll71
-rw-r--r--test/Transforms/LoopIdiom/basic.ll8
-rw-r--r--test/Transforms/LoopUnroll/peel-loop-pgo.ll2
-rw-r--r--test/Transforms/NewGVN/equivalent-phi.ll68
-rw-r--r--test/Transforms/NewGVN/pr31483.ll106
-rw-r--r--test/Transforms/PartiallyInlineLibCalls/X86/good-prototype.ll21
-rw-r--r--test/Transforms/PartiallyInlineLibCalls/X86/lit.local.cfg2
-rw-r--r--test/Transforms/SLPVectorizer/X86/horizontal-list.ll15
-rw-r--r--test/tools/gold/X86/Inputs/thinlto.ll1
-rw-r--r--test/tools/gold/X86/Inputs/thinlto_archive1.ll1
-rw-r--r--test/tools/gold/X86/Inputs/thinlto_archive2.ll1
-rw-r--r--test/tools/gold/X86/comdat.ll2
-rw-r--r--test/tools/gold/X86/opt-level.ll4
-rw-r--r--test/tools/gold/X86/pr25907.ll2
-rw-r--r--test/tools/gold/X86/stats.ll1
-rw-r--r--test/tools/gold/X86/strip_names.ll3
-rw-r--r--test/tools/gold/X86/thinlto.ll8
-rw-r--r--test/tools/gold/X86/thinlto_afdo.ll2
-rw-r--r--test/tools/gold/X86/thinlto_archive.ll2
-rw-r--r--test/tools/gold/X86/type-merge2.ll2
-rw-r--r--test/tools/gold/X86/visibility.ll2
-rw-r--r--tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp14
-rw-r--r--tools/llvm-link/CMakeLists.txt1
-rw-r--r--tools/llvm-link/LLVMBuild.txt2
-rw-r--r--tools/llvm-link/llvm-link.cpp51
-rw-r--r--unittests/ADT/PriorityWorklistTest.cpp47
-rw-r--r--unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp75
-rw-r--r--unittests/Support/YAMLIOTest.cpp62
159 files changed, 4891 insertions, 4343 deletions
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index fe3afd3fcc26..530a5ddaab4d 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -457,6 +457,13 @@ if( MSVC )
if(LLVM_ENABLE_DIA_SDK AND NOT HAVE_DIA_SDK)
message(FATAL_ERROR "DIA SDK not found. If you have both VS 2012 and 2013 installed, you may need to uninstall the former and re-install the latter afterwards.")
endif()
+
+ # Normalize to 0/1 for lit.site.cfg
+ if(LLVM_ENABLE_DIA_SDK)
+ set(LLVM_ENABLE_DIA_SDK 1)
+ else()
+ set(LLVM_ENABLE_DIA_SDK 0)
+ endif()
else()
set(LLVM_ENABLE_DIA_SDK 0)
endif( MSVC )
diff --git a/cmake/modules/CheckCompilerVersion.cmake b/cmake/modules/CheckCompilerVersion.cmake
index cdad7ce27651..2e8f5445781c 100644
--- a/cmake/modules/CheckCompilerVersion.cmake
+++ b/cmake/modules/CheckCompilerVersion.cmake
@@ -43,8 +43,8 @@ int main() { return (float)x; }"
elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.0)
message(FATAL_ERROR "Host Visual Studio must be at least 2015")
- elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.00.24215.1)
- message(WARNING "Host Visual Studio should at least be 2015 Update 3 (MSVC 19.00.24215.1)"
+ elseif(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.00.24213.1)
+ message(WARNING "Host Visual Studio should at least be 2015 Update 3 (MSVC 19.00.24213.1)"
" due to miscompiles from earlier versions")
endif()
endif()
diff --git a/include/llvm/ADT/IntrusiveRefCntPtr.h b/include/llvm/ADT/IntrusiveRefCntPtr.h
index 559fb40773aa..a77cf04ea4d1 100644
--- a/include/llvm/ADT/IntrusiveRefCntPtr.h
+++ b/include/llvm/ADT/IntrusiveRefCntPtr.h
@@ -21,8 +21,8 @@
// class MyClass : public RefCountedBase<MyClass> {};
//
// void foo() {
-// // Objects that inherit from RefCountedBase should always be instantiated
-// // on the heap, never on the stack.
+// // Constructing an IntrusiveRefCntPtr increases the pointee's refcount by
+// // 1 (from 0 in this case).
// IntrusiveRefCntPtr<MyClass> Ptr1(new MyClass());
//
// // Copying an IntrusiveRefCntPtr increases the pointee's refcount by 1.
@@ -68,9 +68,6 @@ namespace llvm {
/// calls to Release() and Retain(), which increment and decrement the object's
/// refcount, respectively. When a Release() call decrements the refcount to 0,
/// the object deletes itself.
-///
-/// Objects that inherit from RefCountedBase should always be allocated with
-/// operator new.
template <class Derived> class RefCountedBase {
mutable unsigned RefCount = 0;
diff --git a/include/llvm/ADT/PriorityWorklist.h b/include/llvm/ADT/PriorityWorklist.h
index c0b4709e98f8..3198dd438700 100644
--- a/include/llvm/ADT/PriorityWorklist.h
+++ b/include/llvm/ADT/PriorityWorklist.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Compiler.h"
#include <algorithm>
@@ -107,6 +108,39 @@ public:
return false;
}
+ /// Insert a sequence of new elements into the PriorityWorklist.
+ template <typename SequenceT>
+ typename std::enable_if<!std::is_convertible<SequenceT, T>::value>::type
+ insert(SequenceT &&Input) {
+ if (std::begin(Input) == std::end(Input))
+ // Nothing to do for an empty input sequence.
+ return;
+
+ // First pull the input sequence into the vector as a bulk append
+ // operation.
+ ptrdiff_t StartIndex = V.size();
+ V.insert(V.end(), std::begin(Input), std::end(Input));
+ // Now walk backwards fixing up the index map and deleting any duplicates.
+ for (ptrdiff_t i = V.size() - 1; i >= StartIndex; --i) {
+ auto InsertResult = M.insert({V[i], i});
+ if (InsertResult.second)
+ continue;
+
+ // If the existing index is before this insert's start, nuke that one and
+ // move it up.
+ ptrdiff_t &Index = InsertResult.first->second;
+ if (Index < StartIndex) {
+ V[Index] = T();
+ Index = i;
+ continue;
+ }
+
+ // Otherwise the existing one comes first so just clear out the value in
+ // this slot.
+ V[i] = T();
+ }
+ }
+
/// Remove the last element of the PriorityWorklist.
void pop_back() {
assert(!empty() && "Cannot remove an element when empty!");
@@ -169,6 +203,11 @@ public:
return true;
}
+ /// Reverse the items in the PriorityWorklist.
+ ///
+ /// This does an in-place reversal. Other kinds of reverse aren't easy to
+ /// support in the face of the worklist semantics.
+
/// Completely clear the PriorityWorklist
void clear() {
M.clear();
diff --git a/include/llvm/Analysis/Loads.h b/include/llvm/Analysis/Loads.h
index 139bf3c2116f..e167f36219d2 100644
--- a/include/llvm/Analysis/Loads.h
+++ b/include/llvm/Analysis/Loads.h
@@ -23,10 +23,9 @@ namespace llvm {
class DataLayout;
class MDNode;
-/// isDereferenceablePointer - Return true if this is always a dereferenceable
-/// pointer. If the context instruction is specified perform context-sensitive
-/// analysis and return true if the pointer is dereferenceable at the
-/// specified instruction.
+/// Return true if this is always a dereferenceable pointer. If the context
+/// instruction is specified perform context-sensitive analysis and return true
+/// if the pointer is dereferenceable at the specified instruction.
bool isDereferenceablePointer(const Value *V, const DataLayout &DL,
const Instruction *CtxI = nullptr,
const DominatorTree *DT = nullptr);
@@ -40,8 +39,7 @@ bool isDereferenceableAndAlignedPointer(const Value *V, unsigned Align,
const Instruction *CtxI = nullptr,
const DominatorTree *DT = nullptr);
-/// isSafeToLoadUnconditionally - Return true if we know that executing a load
-/// from this value cannot trap.
+/// Return true if we know that executing a load from this value cannot trap.
///
/// If DT and ScanFrom are specified this method performs context-sensitive
/// analysis and returns true if it is safe to load immediately before ScanFrom.
@@ -54,12 +52,12 @@ bool isSafeToLoadUnconditionally(Value *V, unsigned Align,
Instruction *ScanFrom = nullptr,
const DominatorTree *DT = nullptr);
-/// DefMaxInstsToScan - the default number of maximum instructions
-/// to scan in the block, used by FindAvailableLoadedValue().
+/// The default number of maximum instructions to scan in the block, used by
+/// FindAvailableLoadedValue().
extern cl::opt<unsigned> DefMaxInstsToScan;
-/// \brief Scan backwards to see if we have the value of the given load
-/// available locally within a small number of instructions.
+/// Scan backwards to see if we have the value of the given load available
+/// locally within a small number of instructions.
///
/// You can use this function to scan across multiple blocks: after you call
/// this function, if ScanFrom points at the beginning of the block, it's safe
diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h
index c1be46ddd7b5..be8822df3dba 100644
--- a/include/llvm/CodeGen/AsmPrinter.h
+++ b/include/llvm/CodeGen/AsmPrinter.h
@@ -208,6 +208,8 @@ public:
SledKind Kind;
bool AlwaysInstrument;
const class Function *Fn;
+
+ void emit(int, MCStreamer *, const MCSymbol *) const;
};
// All the sleds to be emitted.
@@ -216,6 +218,9 @@ public:
// Helper function to record a given XRay sled.
void recordSled(MCSymbol *Sled, const MachineInstr &MI, SledKind Kind);
+ /// Emit a table with all XRay instrumentation points.
+ void emitXRayTable();
+
//===------------------------------------------------------------------===//
// MachineFunctionPass Implementation.
//===------------------------------------------------------------------===//
diff --git a/include/llvm/CodeGen/MachineDominators.h b/include/llvm/CodeGen/MachineDominators.h
index 76e1df89169e..21ecef587aa5 100644
--- a/include/llvm/CodeGen/MachineDominators.h
+++ b/include/llvm/CodeGen/MachineDominators.h
@@ -59,6 +59,9 @@ class MachineDominatorTree : public MachineFunctionPass {
/// such as BB == elt.NewBB.
mutable SmallSet<MachineBasicBlock *, 32> NewBBs;
+ /// The DominatorTreeBase that is used to compute a normal dominator tree
+ DominatorTreeBase<MachineBasicBlock>* DT;
+
/// \brief Apply all the recorded critical edges to the DT.
/// This updates the underlying DT information in a way that uses
/// the fast query path of DT as much as possible.
@@ -68,7 +71,6 @@ class MachineDominatorTree : public MachineFunctionPass {
public:
static char ID; // Pass ID, replacement for typeid
- DominatorTreeBase<MachineBasicBlock>* DT;
MachineDominatorTree();
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
index ca9a6c822876..878f1c76ebf6 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
@@ -116,12 +116,12 @@ public:
// An unsigned integer indicating the identity of the source file
// corresponding to a machine instruction.
uint16_t File;
- // An unsigned integer whose value encodes the applicable instruction set
- // architecture for the current instruction.
- uint8_t Isa;
// An unsigned integer representing the DWARF path discriminator value
// for this location.
uint32_t Discriminator;
+ // An unsigned integer whose value encodes the applicable instruction set
+ // architecture for the current instruction.
+ uint8_t Isa;
// A boolean indicating that the current instruction is the beginning of a
// statement.
uint8_t IsStmt:1,
diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index 078959ce15d0..07d5b5ea40dc 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -104,6 +104,13 @@ def int_amdgcn_dispatch_id :
// Instruction Intrinsics
//===----------------------------------------------------------------------===//
+// The first parameter is s_sendmsg immediate (i16),
+// the second one is copied to m0
+def int_amdgcn_s_sendmsg : GCCBuiltin<"__builtin_amdgcn_s_sendmsg">,
+ Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
+def int_amdgcn_s_sendmsghalt : GCCBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
+ Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
+
def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
Intrinsic<[], [], [IntrConvergent]>;
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 3a496cb6645c..85966af9c820 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -2063,130 +2063,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
}
-// Vector extract and insert
-let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_avx512_mask_vextractf32x4_512 :
- GCCBuiltin<"__builtin_ia32_extractf32x4_mask">,
- Intrinsic<[llvm_v4f32_ty], [llvm_v16f32_ty, llvm_i32_ty,
- llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vextracti32x4_512 :
- GCCBuiltin<"__builtin_ia32_extracti32x4_mask">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v16i32_ty, llvm_i32_ty,
- llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vextractf32x4_256 :
- GCCBuiltin<"__builtin_ia32_extractf32x4_256_mask">,
- Intrinsic<[llvm_v4f32_ty], [llvm_v8f32_ty, llvm_i32_ty,
- llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vextracti32x4_256 :
- GCCBuiltin<"__builtin_ia32_extracti32x4_256_mask">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v8i32_ty, llvm_i32_ty,
- llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vextractf64x2_256 :
- GCCBuiltin<"__builtin_ia32_extractf64x2_256_mask">,
- Intrinsic<[llvm_v2f64_ty], [llvm_v4f64_ty, llvm_i32_ty,
- llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vextracti64x2_256 :
- GCCBuiltin<"__builtin_ia32_extracti64x2_256_mask">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v4i64_ty, llvm_i32_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vextractf64x2_512 :
- GCCBuiltin<"__builtin_ia32_extractf64x2_512_mask">,
- Intrinsic<[llvm_v2f64_ty], [llvm_v8f64_ty, llvm_i32_ty,
- llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vextracti64x2_512 :
- GCCBuiltin<"__builtin_ia32_extracti64x2_512_mask">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v8i64_ty, llvm_i32_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vextractf32x8_512 :
- GCCBuiltin<"__builtin_ia32_extractf32x8_mask">,
- Intrinsic<[llvm_v8f32_ty], [llvm_v16f32_ty, llvm_i32_ty,
- llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vextracti32x8_512 :
- GCCBuiltin<"__builtin_ia32_extracti32x8_mask">,
- Intrinsic<[llvm_v8i32_ty],[llvm_v16i32_ty, llvm_i32_ty,
- llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vextractf64x4_512 :
- GCCBuiltin<"__builtin_ia32_extractf64x4_mask">,
- Intrinsic<[llvm_v4f64_ty], [llvm_v8f64_ty, llvm_i32_ty,
- llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vextracti64x4_512 :
- GCCBuiltin<"__builtin_ia32_extracti64x4_mask">,
- Intrinsic<[llvm_v4i64_ty], [llvm_v8i64_ty, llvm_i32_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
-
- def int_x86_avx512_mask_insertf32x4_256 :
- GCCBuiltin<"__builtin_ia32_insertf32x4_256_mask">,
- Intrinsic<[llvm_v8f32_ty],
- [llvm_v8f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_insertf32x4_512 :
- GCCBuiltin<"__builtin_ia32_insertf32x4_mask">,
- Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_insertf32x8_512 :
- GCCBuiltin<"__builtin_ia32_insertf32x8_mask">,
- Intrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v8f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_insertf64x2_256 :
- GCCBuiltin<"__builtin_ia32_insertf64x2_256_mask">,
- Intrinsic<[llvm_v4f64_ty],
- [llvm_v4f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_insertf64x2_512 :
- GCCBuiltin<"__builtin_ia32_insertf64x2_512_mask">,
- Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_insertf64x4_512 :
- GCCBuiltin<"__builtin_ia32_insertf64x4_mask">,
- Intrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v4f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_inserti32x4_256 :
- GCCBuiltin<"__builtin_ia32_inserti32x4_256_mask">,
- Intrinsic<[llvm_v8i32_ty],
- [llvm_v8i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_v8i32_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_inserti32x4_512 :
- GCCBuiltin<"__builtin_ia32_inserti32x4_mask">,
- Intrinsic<[llvm_v16i32_ty],
- [llvm_v16i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_inserti32x8_512 :
- GCCBuiltin<"__builtin_ia32_inserti32x8_mask">,
- Intrinsic<[llvm_v16i32_ty],
- [llvm_v16i32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_inserti64x2_256 :
- GCCBuiltin<"__builtin_ia32_inserti64x2_256_mask">,
- Intrinsic<[llvm_v4i64_ty],
- [llvm_v4i64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_v4i64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_inserti64x2_512 :
- GCCBuiltin<"__builtin_ia32_inserti64x2_512_mask">,
- Intrinsic<[llvm_v8i64_ty],
- [llvm_v8i64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_v8i64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-
- def int_x86_avx512_mask_inserti64x4_512 :
- GCCBuiltin<"__builtin_ia32_inserti64x4_mask">,
- Intrinsic<[llvm_v8i64_ty],
- [llvm_v8i64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_v8i64_ty, llvm_i8_ty],
- [IntrNoMem]>;
-}
-
// Conditional load ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_maskload_d : GCCBuiltin<"__builtin_ia32_maskloadd">,
diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h
index 9d8d8c3ffb5c..347f21108913 100644
--- a/include/llvm/Support/FileSystem.h
+++ b/include/llvm/Support/FileSystem.h
@@ -769,17 +769,13 @@ namespace detail {
std::error_code directory_iterator_increment(DirIterState &);
std::error_code directory_iterator_destruct(DirIterState &);
- /// DirIterState - Keeps state for the directory_iterator. It is reference
- /// counted in order to preserve InputIterator semantics on copy.
- struct DirIterState : public RefCountedBase<DirIterState> {
- DirIterState()
- : IterationHandle(0) {}
-
+ /// Keeps state for the directory_iterator.
+ struct DirIterState {
~DirIterState() {
directory_iterator_destruct(*this);
}
- intptr_t IterationHandle;
+ intptr_t IterationHandle = 0;
directory_entry CurrentEntry;
};
} // end namespace detail
@@ -788,23 +784,23 @@ namespace detail {
/// operator++ because we need an error_code. If it's really needed we can make
/// it call report_fatal_error on error.
class directory_iterator {
- IntrusiveRefCntPtr<detail::DirIterState> State;
+ std::shared_ptr<detail::DirIterState> State;
public:
explicit directory_iterator(const Twine &path, std::error_code &ec) {
- State = new detail::DirIterState;
+ State = std::make_shared<detail::DirIterState>();
SmallString<128> path_storage;
ec = detail::directory_iterator_construct(*State,
path.toStringRef(path_storage));
}
explicit directory_iterator(const directory_entry &de, std::error_code &ec) {
- State = new detail::DirIterState;
+ State = std::make_shared<detail::DirIterState>();
ec = detail::directory_iterator_construct(*State, de.path());
}
/// Construct end iterator.
- directory_iterator() : State(nullptr) {}
+ directory_iterator() = default;
// No operator++ because we need error_code.
directory_iterator &increment(std::error_code &ec) {
diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index 38acb36942bc..cbba9c08275a 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -209,6 +209,15 @@ struct DocumentListTraits {
// static T::value_type& element(IO &io, T &seq, size_t index);
};
+/// This class should be specialized by any type that needs to be converted
+/// to/from a YAML mapping in the case where the names of the keys are not known
+/// in advance, e.g. a string map.
+template <typename T>
+struct CustomMappingTraits {
+ // static void inputOne(IO &io, StringRef key, T &elem);
+ // static void output(IO &io, T &elem);
+};
+
// Only used for better diagnostics of missing traits
template <typename T>
struct MissingTrait;
@@ -358,6 +367,23 @@ public:
static bool const value = (sizeof(test<SequenceTraits<T>>(nullptr)) == 1);
};
+// Test if CustomMappingTraits<T> is defined on type T.
+template <class T>
+struct has_CustomMappingTraits
+{
+ typedef void (*Signature_input)(IO &io, StringRef key, T &v);
+
+ template <typename U>
+ static char test(SameType<Signature_input, &U::inputOne>*);
+
+ template <typename U>
+ static double test(...);
+
+public:
+ static bool const value =
+ (sizeof(test<CustomMappingTraits<T>>(nullptr)) == 1);
+};
+
// has_FlowTraits<int> will cause an error with some compilers because
// it subclasses int. Using this wrapper only instantiates the
// real has_FlowTraits only if the template type is a class.
@@ -493,6 +519,7 @@ struct missingTraits
!has_BlockScalarTraits<T>::value &&
!has_MappingTraits<T, Context>::value &&
!has_SequenceTraits<T>::value &&
+ !has_CustomMappingTraits<T>::value &&
!has_DocumentListTraits<T>::value> {};
template <typename T, typename Context>
@@ -531,6 +558,7 @@ public:
virtual void endMapping() = 0;
virtual bool preflightKey(const char*, bool, bool, bool &, void *&) = 0;
virtual void postflightKey(void*) = 0;
+ virtual std::vector<StringRef> keys() = 0;
virtual void beginFlowMapping() = 0;
virtual void endFlowMapping() = 0;
@@ -819,6 +847,21 @@ yamlize(IO &io, T &Val, bool, Context &Ctx) {
}
template <typename T>
+typename std::enable_if<has_CustomMappingTraits<T>::value, void>::type
+yamlize(IO &io, T &Val, bool, EmptyContext &Ctx) {
+ if ( io.outputting() ) {
+ io.beginMapping();
+ CustomMappingTraits<T>::output(io, Val);
+ io.endMapping();
+ } else {
+ io.beginMapping();
+ for (StringRef key : io.keys())
+ CustomMappingTraits<T>::inputOne(io, key, Val);
+ io.endMapping();
+ }
+}
+
+template <typename T>
typename std::enable_if<missingTraits<T, EmptyContext>::value, void>::type
yamlize(IO &io, T &Val, bool, EmptyContext &Ctx) {
char missing_yaml_trait_for_type[sizeof(MissingTrait<T>)];
@@ -1074,6 +1117,7 @@ private:
void endMapping() override;
bool preflightKey(const char *, bool, bool, bool &, void *&) override;
void postflightKey(void *) override;
+ std::vector<StringRef> keys() override;
void beginFlowMapping() override;
void endFlowMapping() override;
unsigned beginSequence() override;
@@ -1154,10 +1198,8 @@ private:
typedef llvm::StringMap<std::unique_ptr<HNode>> NameToNode;
- bool isValidKey(StringRef key);
-
NameToNode Mapping;
- llvm::SmallVector<const char*, 6> ValidKeys;
+ llvm::SmallVector<std::string, 6> ValidKeys;
};
class SequenceHNode : public HNode {
@@ -1215,6 +1257,7 @@ public:
void endMapping() override;
bool preflightKey(const char *key, bool, bool, bool &, void *&) override;
void postflightKey(void *) override;
+ std::vector<StringRef> keys() override;
void beginFlowMapping() override;
void endFlowMapping() override;
unsigned beginSequence() override;
@@ -1384,6 +1427,17 @@ operator>>(Input &In, T &Val) {
return In;
}
+// Define non-member operator>> so that Input can stream in a string map.
+template <typename T>
+inline
+typename std::enable_if<has_CustomMappingTraits<T>::value, Input &>::type
+operator>>(Input &In, T &Val) {
+ EmptyContext Ctx;
+ if (In.setCurrentDocument())
+ yamlize(In, Val, true, Ctx);
+ return In;
+}
+
// Provide better error message about types missing a trait specialization
template <typename T>
inline typename std::enable_if<missingTraits<T, EmptyContext>::value,
@@ -1457,6 +1511,21 @@ operator<<(Output &Out, T &Val) {
return Out;
}
+// Define non-member operator<< so that Output can stream out a string map.
+template <typename T>
+inline
+typename std::enable_if<has_CustomMappingTraits<T>::value, Output &>::type
+operator<<(Output &Out, T &Val) {
+ EmptyContext Ctx;
+ Out.beginDocuments();
+ if (Out.preflightDocument(0)) {
+ yamlize(Out, Val, true, Ctx);
+ Out.postflightDocument();
+ }
+ Out.endDocuments();
+ return Out;
+}
+
// Provide better error message about types missing a trait specialization
template <typename T>
inline typename std::enable_if<missingTraits<T, EmptyContext>::value,
@@ -1476,6 +1545,18 @@ template <typename T> struct SequenceTraitsImpl {
}
};
+/// Implementation of CustomMappingTraits for std::map<std::string, T>.
+template <typename T> struct StdMapStringCustomMappingTraitsImpl {
+ typedef std::map<std::string, T> map_type;
+ static void inputOne(IO &io, StringRef key, map_type &v) {
+ io.mapRequired(key.str().c_str(), v[key]);
+ }
+ static void output(IO &io, map_type &v) {
+ for (auto &p : v)
+ io.mapRequired(p.first.c_str(), p.second);
+ }
+};
+
} // end namespace yaml
} // end namespace llvm
@@ -1530,4 +1611,15 @@ template <typename T> struct SequenceTraitsImpl {
} \
}
+/// Utility for declaring that std::map<std::string, _type> should be considered
+/// a YAML map.
+#define LLVM_YAML_IS_STRING_MAP(_type) \
+ namespace llvm { \
+ namespace yaml { \
+ template <> \
+ struct CustomMappingTraits<std::map<std::string, _type>> \
+ : public StdMapStringCustomMappingTraitsImpl<_type> {}; \
+ } \
+ }
+
#endif // LLVM_SUPPORT_YAMLTRAITS_H
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 2a77baec6c36..073b4e6ab26a 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -2542,9 +2542,6 @@ bool llvm::CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI,
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
return !CFP->getValueAPF().isNegZero();
- // FIXME: Magic number! At the least, this should be given a name because it's
- // used similarly in CannotBeOrderedLessThanZero(). A better fix may be to
- // expose it as a parameter, so it can be used for testing / experimenting.
if (Depth == MaxDepth)
return false; // Limit search depth.
@@ -2589,9 +2586,6 @@ bool llvm::CannotBeOrderedLessThanZero(const Value *V,
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
return !CFP->getValueAPF().isNegative() || CFP->getValueAPF().isZero();
- // FIXME: Magic number! At the least, this should be given a name because it's
- // used similarly in CannotBeNegativeZero(). A better fix may be to
- // expose it as a parameter, so it can be used for testing / experimenting.
if (Depth == MaxDepth)
return false; // Limit search depth.
diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp
index cd08268d47b5..5da421a79b7b 100644
--- a/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -749,7 +749,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
// handles the case where this is type ODRed with a definition needed
// by the importing module, in which case the existing definition is
// used.
- if (IsImporting && !ImportFullTypeDefinitions &&
+ if (IsImporting && !ImportFullTypeDefinitions && Identifier &&
(Tag == dwarf::DW_TAG_enumeration_type ||
Tag == dwarf::DW_TAG_class_type ||
Tag == dwarf::DW_TAG_structure_type ||
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 0678bce449ed..79ecc4308fe7 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -272,28 +272,10 @@ static const Value *getNoopInput(const Value *V,
TLI.allowTruncateForTailCall(Op->getType(), I->getType())) {
DataBits = std::min(DataBits, I->getType()->getPrimitiveSizeInBits());
NoopInput = Op;
- } else if (isa<CallInst>(I)) {
- // Look through call (skipping callee)
- for (User::const_op_iterator i = I->op_begin(), e = I->op_end() - 1;
- i != e; ++i) {
- unsigned attrInd = i - I->op_begin() + 1;
- if (cast<CallInst>(I)->paramHasAttr(attrInd, Attribute::Returned) &&
- isNoopBitcast((*i)->getType(), I->getType(), TLI)) {
- NoopInput = *i;
- break;
- }
- }
- } else if (isa<InvokeInst>(I)) {
- // Look through invoke (skipping BB, BB, Callee)
- for (User::const_op_iterator i = I->op_begin(), e = I->op_end() - 3;
- i != e; ++i) {
- unsigned attrInd = i - I->op_begin() + 1;
- if (cast<InvokeInst>(I)->paramHasAttr(attrInd, Attribute::Returned) &&
- isNoopBitcast((*i)->getType(), I->getType(), TLI)) {
- NoopInput = *i;
- break;
- }
- }
+ } else if (auto CS = ImmutableCallSite(I)) {
+ const Value *ReturnedOp = CS.getReturnedArgOperand();
+ if (ReturnedOp && isNoopBitcast(ReturnedOp->getType(), I->getType(), TLI))
+ NoopInput = ReturnedOp;
} else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(V)) {
// Value may come from either the aggregate or the scalar
ArrayRef<unsigned> InsertLoc = IVI->getIndices();
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index de0a4f0befa1..5f15ac1d503b 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -37,6 +37,8 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCValue.h"
@@ -2610,6 +2612,61 @@ AsmPrinterHandler::~AsmPrinterHandler() {}
void AsmPrinterHandler::markFunctionEnd() {}
+// In the binary's "xray_instr_map" section, an array of these function entries
+// describes each instrumentation point. When XRay patches your code, the index
+// into this table will be given to your handler as a patch point identifier.
+void AsmPrinter::XRayFunctionEntry::emit(int Bytes, MCStreamer *Out,
+ const MCSymbol *CurrentFnSym) const {
+ Out->EmitSymbolValue(Sled, Bytes);
+ Out->EmitSymbolValue(CurrentFnSym, Bytes);
+ auto Kind8 = static_cast<uint8_t>(Kind);
+ Out->EmitBytes(StringRef(reinterpret_cast<const char *>(&Kind8), 1));
+ Out->EmitBytes(
+ StringRef(reinterpret_cast<const char *>(&AlwaysInstrument), 1));
+ Out->EmitZeros(2 * Bytes - 2); // Pad the previous two entries
+}
+
+void AsmPrinter::emitXRayTable() {
+ if (Sleds.empty())
+ return;
+
+ auto PrevSection = OutStreamer->getCurrentSectionOnly();
+ auto Fn = MF->getFunction();
+ MCSection *Section = nullptr;
+ if (MF->getSubtarget().getTargetTriple().isOSBinFormatELF()) {
+ if (Fn->hasComdat()) {
+ Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
+ Fn->getComdat()->getName());
+ } else {
+ Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC);
+ }
+ } else if (MF->getSubtarget().getTargetTriple().isOSBinFormatMachO()) {
+ Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
+ SectionKind::getReadOnlyWithRel());
+ } else {
+ llvm_unreachable("Unsupported target");
+ }
+
+ // Before we switch over, we force a reference to a label inside the
+ // xray_instr_map section. Since this function is always called just
+ // before the function's end, we assume that this is happening after
+ // the last return instruction.
+
+ auto WordSizeBytes = TM.getPointerSize();
+ MCSymbol *Tmp = OutContext.createTempSymbol("xray_synthetic_", true);
+ OutStreamer->EmitCodeAlignment(16);
+ OutStreamer->EmitSymbolValue(Tmp, WordSizeBytes, false);
+ OutStreamer->SwitchSection(Section);
+ OutStreamer->EmitLabel(Tmp);
+ for (const auto &Sled : Sleds)
+ Sled.emit(WordSizeBytes, OutStreamer.get(), CurrentFnSym);
+
+ OutStreamer->SwitchSection(PrevSection);
+ Sleds.clear();
+}
+
void AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI,
SledKind Kind) {
auto Fn = MI.getParent()->getParent()->getFunction();
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 422f2dc2f2fb..3d81184f774a 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -1124,7 +1124,7 @@ void HoistSpillHelper::rmRedundantSpills(
// earlier spill with smaller SlotIndex.
for (const auto CurrentSpill : Spills) {
MachineBasicBlock *Block = CurrentSpill->getParent();
- MachineDomTreeNode *Node = MDT.DT->getNode(Block);
+ MachineDomTreeNode *Node = MDT.getBase().getNode(Block);
MachineInstr *PrevSpill = SpillBBToSpill[Node];
if (PrevSpill) {
SlotIndex PIdx = LIS.getInstructionIndex(*PrevSpill);
@@ -1132,9 +1132,9 @@ void HoistSpillHelper::rmRedundantSpills(
MachineInstr *SpillToRm = (CIdx > PIdx) ? CurrentSpill : PrevSpill;
MachineInstr *SpillToKeep = (CIdx > PIdx) ? PrevSpill : CurrentSpill;
SpillsToRm.push_back(SpillToRm);
- SpillBBToSpill[MDT.DT->getNode(Block)] = SpillToKeep;
+ SpillBBToSpill[MDT.getBase().getNode(Block)] = SpillToKeep;
} else {
- SpillBBToSpill[MDT.DT->getNode(Block)] = CurrentSpill;
+ SpillBBToSpill[MDT.getBase().getNode(Block)] = CurrentSpill;
}
}
for (const auto SpillToRm : SpillsToRm)
@@ -1209,7 +1209,7 @@ void HoistSpillHelper::getVisitOrders(
// Sort the nodes in WorkSet in top-down order and save the nodes
// in Orders. Orders will be used for hoisting in runHoistSpills.
unsigned idx = 0;
- Orders.push_back(MDT.DT->getNode(Root));
+ Orders.push_back(MDT.getBase().getNode(Root));
do {
MachineDomTreeNode *Node = Orders[idx++];
const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b4b41c3d0011..4632484055d2 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4277,7 +4277,8 @@ struct BaseIndexOffset {
}
/// Parses tree in Ptr for base, index, offset addresses.
- static BaseIndexOffset match(SDValue Ptr, SelectionDAG &DAG) {
+ static BaseIndexOffset match(SDValue Ptr, SelectionDAG &DAG,
+ int64_t PartialOffset = 0) {
bool IsIndexSignExt = false;
// Split up a folded GlobalAddress+Offset into its component parts.
@@ -4286,7 +4287,7 @@ struct BaseIndexOffset {
return BaseIndexOffset(DAG.getGlobalAddress(GA->getGlobal(),
SDLoc(GA),
GA->getValueType(0),
- /*Offset=*/0,
+ /*Offset=*/PartialOffset,
/*isTargetGA=*/false,
GA->getTargetFlags()),
SDValue(),
@@ -4298,14 +4299,13 @@ struct BaseIndexOffset {
// instruction, then it could be just the BASE or everything else we don't
// know how to handle. Just use Ptr as BASE and give up.
if (Ptr->getOpcode() != ISD::ADD)
- return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
+ return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt);
// We know that we have at least an ADD instruction. Try to pattern match
// the simple case of BASE + OFFSET.
if (isa<ConstantSDNode>(Ptr->getOperand(1))) {
int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue();
- return BaseIndexOffset(Ptr->getOperand(0), SDValue(), Offset,
- IsIndexSignExt);
+ return match(Ptr->getOperand(0), DAG, Offset + PartialOffset);
}
// Inside a loop the current BASE pointer is calculated using an ADD and a
@@ -4314,7 +4314,7 @@ struct BaseIndexOffset {
// (i64 mul (i64 %induction_var)
// (i64 %element_size)))
if (Ptr->getOperand(1)->getOpcode() == ISD::MUL)
- return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
+ return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt);
// Look at Base + Index + Offset cases.
SDValue Base = Ptr->getOperand(0);
@@ -4328,14 +4328,14 @@ struct BaseIndexOffset {
// Either the case of Base + Index (no offset) or something else.
if (IndexOffset->getOpcode() != ISD::ADD)
- return BaseIndexOffset(Base, IndexOffset, 0, IsIndexSignExt);
+ return BaseIndexOffset(Base, IndexOffset, PartialOffset, IsIndexSignExt);
// Now we have the case of Base + Index + offset.
SDValue Index = IndexOffset->getOperand(0);
SDValue Offset = IndexOffset->getOperand(1);
if (!isa<ConstantSDNode>(Offset))
- return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
+ return BaseIndexOffset(Ptr, SDValue(), PartialOffset, IsIndexSignExt);
// Ignore signextends.
if (Index->getOpcode() == ISD::SIGN_EXTEND) {
@@ -4344,7 +4344,7 @@ struct BaseIndexOffset {
} else IsIndexSignExt = false;
int64_t Off = cast<ConstantSDNode>(Offset)->getSExtValue();
- return BaseIndexOffset(Base, Index, Off, IsIndexSignExt);
+ return BaseIndexOffset(Base, Index, Off + PartialOffset, IsIndexSignExt);
}
};
} // namespace
diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
index 324d07118704..57b5d85bb550 100644
--- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
+++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
@@ -88,15 +88,15 @@ void OProfileJITEventListener::NotifyObjectEmitted(
// Use symbol info to iterate functions in the object.
for (const std::pair<SymbolRef, uint64_t> &P : computeSymbolSizes(DebugObj)) {
SymbolRef Sym = P.first;
- if (Sym.getType() != SymbolRef::ST_Function)
+ if (!Sym.getType() || *Sym.getType() != SymbolRef::ST_Function)
continue;
- ErrorOr<StringRef> NameOrErr = Sym.getName();
- if (NameOrErr.getError())
+ Expected<StringRef> NameOrErr = Sym.getName();
+ if (!NameOrErr)
continue;
StringRef Name = *NameOrErr;
- ErrorOr<uint64_t> AddrOrErr = Sym.getAddress();
- if (AddrOrErr.getError())
+ Expected<uint64_t> AddrOrErr = Sym.getAddress();
+ if (!AddrOrErr)
continue;
uint64_t Addr = *AddrOrErr;
uint64_t Size = P.second;
@@ -128,9 +128,9 @@ void OProfileJITEventListener::NotifyFreeingObject(const ObjectFile &Obj) {
for (symbol_iterator I = DebugObj.symbol_begin(),
E = DebugObj.symbol_end();
I != E; ++I) {
- if (I->getType() == SymbolRef::ST_Function) {
- ErrorOr<uint64_t> AddrOrErr = I->getAddress();
- if (AddrOrErr.getError())
+ if (I->getType() && *I->getType() == SymbolRef::ST_Function) {
+ Expected<uint64_t> AddrOrErr = I->getAddress();
+ if (!AddrOrErr)
continue;
uint64_t Addr = *AddrOrErr;
diff --git a/lib/Fuzzer/FuzzerTracePC.cpp b/lib/Fuzzer/FuzzerTracePC.cpp
index 01c0b8c2ddb5..39d6e6026210 100644
--- a/lib/Fuzzer/FuzzerTracePC.cpp
+++ b/lib/Fuzzer/FuzzerTracePC.cpp
@@ -80,6 +80,7 @@ static bool IsInterestingCoverageFile(std::string &File) {
}
void TracePC::InitializePrintNewPCs() {
+ if (!DoPrintNewPCs) return;
assert(!PrintedPCs);
PrintedPCs = new std::set<uintptr_t>;
for (size_t i = 1; i < GetNumPCs(); i++)
@@ -88,6 +89,7 @@ void TracePC::InitializePrintNewPCs() {
}
void TracePC::PrintNewPCs() {
+ if (!DoPrintNewPCs) return;
assert(PrintedPCs);
for (size_t i = 1; i < GetNumPCs(); i++)
if (PCs[i] && PrintedPCs->insert(PCs[i]).second)
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 2d9d0f95efa5..a87b9bec1ed2 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -342,8 +342,10 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name == "avx.cvt.ps2.pd.256" || // Added in 3.9
Name.startswith("avx.vinsertf128.") || // Added in 3.7
Name == "avx2.vinserti128" || // Added in 3.7
+ Name.startswith("avx512.mask.insert") || // Added in 4.0
Name.startswith("avx.vextractf128.") || // Added in 3.7
Name == "avx2.vextracti128" || // Added in 3.7
+ Name.startswith("avx512.mask.vextract") || // Added in 4.0
Name.startswith("sse4a.movnt.") || // Added in 3.9
Name.startswith("avx.movnt.") || // Added in 3.2
Name.startswith("avx512.storent.") || // Added in 3.9
@@ -1150,21 +1152,25 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs);
} else if (IsX86 && (Name.startswith("avx.vinsertf128.") ||
- Name == "avx2.vinserti128")) {
+ Name == "avx2.vinserti128" ||
+ Name.startswith("avx512.mask.insert"))) {
Value *Op0 = CI->getArgOperand(0);
Value *Op1 = CI->getArgOperand(1);
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
- VectorType *VecTy = cast<VectorType>(CI->getType());
- unsigned NumElts = VecTy->getNumElements();
+ unsigned DstNumElts = CI->getType()->getVectorNumElements();
+ unsigned SrcNumElts = Op1->getType()->getVectorNumElements();
+ unsigned Scale = DstNumElts / SrcNumElts;
// Mask off the high bits of the immediate value; hardware ignores those.
- Imm = Imm & 1;
+ Imm = Imm % Scale;
- // Extend the second operand into a vector that is twice as big.
+ // Extend the second operand into a vector the size of the destination.
Value *UndefV = UndefValue::get(Op1->getType());
- SmallVector<uint32_t, 8> Idxs(NumElts);
- for (unsigned i = 0; i != NumElts; ++i)
+ SmallVector<uint32_t, 8> Idxs(DstNumElts);
+ for (unsigned i = 0; i != SrcNumElts; ++i)
Idxs[i] = i;
+ for (unsigned i = SrcNumElts; i != DstNumElts; ++i)
+ Idxs[i] = SrcNumElts;
Rep = Builder.CreateShuffleVector(Op1, UndefV, Idxs);
// Insert the second operand into the first operand.
@@ -1178,33 +1184,41 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Imm = 1 <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
// Imm = 0 <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7 >
- // The low half of the result is either the low half of the 1st operand
- // or the low half of the 2nd operand (the inserted vector).
- for (unsigned i = 0; i != NumElts / 2; ++i)
- Idxs[i] = Imm ? i : (i + NumElts);
- // The high half of the result is either the low half of the 2nd operand
- // (the inserted vector) or the high half of the 1st operand.
- for (unsigned i = NumElts / 2; i != NumElts; ++i)
- Idxs[i] = Imm ? (i + NumElts / 2) : i;
+ // First fill with identify mask.
+ for (unsigned i = 0; i != DstNumElts; ++i)
+ Idxs[i] = i;
+ // Then replace the elements where we need to insert.
+ for (unsigned i = 0; i != SrcNumElts; ++i)
+ Idxs[i + Imm * SrcNumElts] = i + DstNumElts;
Rep = Builder.CreateShuffleVector(Op0, Rep, Idxs);
+
+ // If the intrinsic has a mask operand, handle that.
+ if (CI->getNumArgOperands() == 5)
+ Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep,
+ CI->getArgOperand(3));
} else if (IsX86 && (Name.startswith("avx.vextractf128.") ||
- Name == "avx2.vextracti128")) {
+ Name == "avx2.vextracti128" ||
+ Name.startswith("avx512.mask.vextract"))) {
Value *Op0 = CI->getArgOperand(0);
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- VectorType *VecTy = cast<VectorType>(CI->getType());
- unsigned NumElts = VecTy->getNumElements();
+ unsigned DstNumElts = CI->getType()->getVectorNumElements();
+ unsigned SrcNumElts = Op0->getType()->getVectorNumElements();
+ unsigned Scale = SrcNumElts / DstNumElts;
// Mask off the high bits of the immediate value; hardware ignores those.
- Imm = Imm & 1;
+ Imm = Imm % Scale;
- // Get indexes for either the high half or low half of the input vector.
- SmallVector<uint32_t, 4> Idxs(NumElts);
- for (unsigned i = 0; i != NumElts; ++i) {
- Idxs[i] = Imm ? (i + NumElts) : i;
+ // Get indexes for the subvector of the input vector.
+ SmallVector<uint32_t, 8> Idxs(DstNumElts);
+ for (unsigned i = 0; i != DstNumElts; ++i) {
+ Idxs[i] = i + (Imm * DstNumElts);
}
+ Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);
- Value *UndefV = UndefValue::get(Op0->getType());
- Rep = Builder.CreateShuffleVector(Op0, UndefV, Idxs);
+ // If the intrinsic has a mask operand, handle that.
+ if (CI->getNumArgOperands() == 4)
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+ CI->getArgOperand(2));
} else if (!IsX86 && Name == "stackprotectorcheck") {
Rep = nullptr;
} else if (IsX86 && (Name.startswith("avx512.mask.perm.df.") ||
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 7364f0e0cd31..42b3a344352b 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -891,23 +891,17 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
AddStream, Cache);
- // Partition numbers for ThinLTO jobs start at 1 (see comments for
- // GlobalResolution in LTO.h). Task numbers, however, start at
- // ParallelCodeGenParallelismLevel if an LTO module is present, as tasks 0
- // through ParallelCodeGenParallelismLevel-1 are reserved for parallel code
- // generation partitions.
+ // Task numbers start at ParallelCodeGenParallelismLevel if an LTO
+ // module is present, as tasks 0 through ParallelCodeGenParallelismLevel-1
+ // are reserved for parallel code generation partitions.
unsigned Task =
HasRegularLTO ? RegularLTO.ParallelCodeGenParallelismLevel : 0;
- unsigned Partition = 1;
-
for (auto &Mod : ThinLTO.ModuleMap) {
if (Error E = BackendProc->start(Task, Mod.second, ImportLists[Mod.first],
ExportLists[Mod.first],
ResolvedODR[Mod.first], ThinLTO.ModuleMap))
return E;
-
++Task;
- ++Partition;
}
return BackendProc->wait();
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 30f0deab90a0..4cfbbf8645e0 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -76,8 +76,12 @@ namespace llvm {
compile-time arithmetic on PPC double-double numbers, it is not able
to represent all possible values held by a PPC double-double number,
for example: (long double) 1.0 + (long double) 0x1p-106
- Should this be replaced by a full emulation of PPC double-double? */
- static const fltSemantics semPPCDoubleDouble = {0, 0, 0, 0};
+ Should this be replaced by a full emulation of PPC double-double?
+
+ Note: we need to make the value different from semBogus as otherwise
+ an unsafe optimization may collapse both values to a single address,
+ and we heavily rely on them having distinct addresses. */
+ static const fltSemantics semPPCDoubleDouble = {-1, 0, 0, 0};
/* There are temporary semantics for the real PPCDoubleDouble implementation.
Currently, APFloat of PPCDoubleDouble holds one PPCDoubleDoubleImpl as the
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index dd19eee15f62..49d0ed55a716 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -1069,6 +1069,7 @@ StringRef sys::getHostCPUName() {
.Case("POWER7", "pwr7")
.Case("POWER8", "pwr8")
.Case("POWER8E", "pwr8")
+ .Case("POWER8NVL", "pwr8")
.Case("POWER9", "pwr9")
.Default(generic);
}
diff --git a/lib/Support/NativeFormatting.cpp b/lib/Support/NativeFormatting.cpp
index bb8689141098..b951a88a38db 100644
--- a/lib/Support/NativeFormatting.cpp
+++ b/lib/Support/NativeFormatting.cpp
@@ -239,10 +239,7 @@ void llvm::write_double(raw_ostream &S, double N, FloatStyle Style,
N *= 100.0;
char Buf[32];
- unsigned Len;
- Len = format(Spec.c_str(), N).snprint(Buf, sizeof(Buf));
- if (Style == FloatStyle::Percent)
- ++Len;
+ format(Spec.c_str(), N).snprint(Buf, sizeof(Buf));
S << Buf;
if (Style == FloatStyle::Percent)
S << '%';
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index 99d2070cb6ed..9849b3aa1ce9 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -118,6 +118,18 @@ void Input::beginMapping() {
}
}
+std::vector<StringRef> Input::keys() {
+ MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
+ std::vector<StringRef> Ret;
+ if (!MN) {
+ setError(CurrentNode, "not a mapping");
+ return Ret;
+ }
+ for (auto &P : MN->Mapping)
+ Ret.push_back(P.first());
+ return Ret;
+}
+
bool Input::preflightKey(const char *Key, bool Required, bool, bool &UseDefault,
void *&SaveInfo) {
UseDefault = false;
@@ -163,7 +175,7 @@ void Input::endMapping() {
if (!MN)
return;
for (const auto &NN : MN->Mapping) {
- if (!MN->isValidKey(NN.first())) {
+ if (!is_contained(MN->ValidKeys, NN.first())) {
setError(NN.second.get(), Twine("unknown key '") + NN.first() + "'");
break;
}
@@ -373,14 +385,6 @@ std::unique_ptr<Input::HNode> Input::createHNodes(Node *N) {
}
}
-bool Input::MapHNode::isValidKey(StringRef Key) {
- for (const char *K : ValidKeys) {
- if (Key.equals(K))
- return true;
- }
- return false;
-}
-
void Input::setError(const Twine &Message) {
this->setError(CurrentNode, Message);
}
@@ -451,6 +455,10 @@ void Output::endMapping() {
StateStack.pop_back();
}
+std::vector<StringRef> Output::keys() {
+ report_fatal_error("invalid call");
+}
+
bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault,
bool &UseDefault, void *&) {
UseDefault = false;
diff --git a/lib/TableGen/StringMatcher.cpp b/lib/TableGen/StringMatcher.cpp
index 16681702d1d6..0c83da65e19e 100644
--- a/lib/TableGen/StringMatcher.cpp
+++ b/lib/TableGen/StringMatcher.cpp
@@ -11,9 +11,15 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/TableGen/StringMatcher.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/StringMatcher.h"
+#include <cassert>
#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
using namespace llvm;
/// FindFirstNonCommonLetter - Find the first character in the keys of the
@@ -67,7 +73,7 @@ EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches,
}
// Bucket the matches by the character we are comparing.
- std::map<char, std::vector<const StringPair*> > MatchesByLetter;
+ std::map<char, std::vector<const StringPair*>> MatchesByLetter;
for (unsigned i = 0, e = Matches.size(); i != e; ++i)
MatchesByLetter[Matches[i]->first[CharNo]].push_back(Matches[i]);
@@ -91,7 +97,7 @@ EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches,
// FIXME: Need to escape general strings.
OS << Indent << "if (memcmp(" << StrVariableName << ".data()+" << CharNo
<< ", \"" << Matches[0]->first.substr(CharNo, NumChars) << "\", "
- << NumChars << "))\n";
+ << NumChars << ") != 0)\n";
OS << Indent << " break;\n";
}
@@ -103,7 +109,7 @@ EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches,
OS << Indent << "switch (" << StrVariableName << "[" << CharNo << "]) {\n";
OS << Indent << "default: break;\n";
- for (std::map<char, std::vector<const StringPair*> >::iterator LI =
+ for (std::map<char, std::vector<const StringPair*>>::iterator LI =
MatchesByLetter.begin(), E = MatchesByLetter.end(); LI != E; ++LI) {
// TODO: escape hard stuff (like \n) if we ever care about it.
OS << Indent << "case '" << LI->first << "':\t // "
@@ -118,7 +124,6 @@ EmitStringMatcherForChar(const std::vector<const StringPair*> &Matches,
return true;
}
-
/// Emit - Top level entry point.
///
void StringMatcher::Emit(unsigned Indent) const {
@@ -126,7 +131,7 @@ void StringMatcher::Emit(unsigned Indent) const {
if (Matches.empty()) return;
// First level categorization: group strings by length.
- std::map<unsigned, std::vector<const StringPair*> > MatchesByLength;
+ std::map<unsigned, std::vector<const StringPair*>> MatchesByLength;
for (unsigned i = 0, e = Matches.size(); i != e; ++i)
MatchesByLength[Matches[i].first.size()].push_back(&Matches[i]);
@@ -136,7 +141,7 @@ void StringMatcher::Emit(unsigned Indent) const {
OS.indent(Indent*2+2) << "switch (" << StrVariableName << ".size()) {\n";
OS.indent(Indent*2+2) << "default: break;\n";
- for (std::map<unsigned, std::vector<const StringPair*> >::iterator LI =
+ for (std::map<unsigned, std::vector<const StringPair*>>::iterator LI =
MatchesByLength.begin(), E = MatchesByLength.end(); LI != E; ++LI) {
OS.indent(Indent*2+2) << "case " << LI->first << ":\t // "
<< LI->second.size()
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index c40391d5ad9d..740766b151bb 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -264,9 +264,13 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
"Qualcomm Falkor processors", [
FeatureCRC,
FeatureCrypto,
+ FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
FeatureNEON,
- FeaturePerfMon
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureZCZeroing
]>;
def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index b2d96a32fd3a..efc221893782 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -76,7 +76,6 @@ public:
void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
- void EmitXRayTable();
void EmitSled(const MachineInstr &MI, SledKind Kind);
/// \brief tblgen'erated driver function for lowering simple MI->MC
@@ -95,7 +94,7 @@ public:
AArch64FI = F.getInfo<AArch64FunctionInfo>();
STI = static_cast<const AArch64Subtarget*>(&F.getSubtarget());
bool Result = AsmPrinter::runOnMachineFunction(F);
- EmitXRayTable();
+ emitXRayTable();
return Result;
}
@@ -150,59 +149,6 @@ void AArch64AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI)
EmitSled(MI, SledKind::TAIL_CALL);
}
-void AArch64AsmPrinter::EmitXRayTable()
-{
- //TODO: merge the logic for ELF XRay sleds at a higher level, so to avoid
- // code duplication as it is now for x86_64, ARM32 and AArch64.
- if (Sleds.empty())
- return;
-
- auto PrevSection = OutStreamer->getCurrentSectionOnly();
- auto Fn = MF->getFunction();
- MCSection *Section;
-
- if (STI->isTargetELF()) {
- if (Fn->hasComdat())
- Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
- Fn->getComdat()->getName());
- else
- Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC);
- } else if (STI->isTargetMachO()) {
- Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
- SectionKind::getReadOnlyWithRel());
- } else {
- llvm_unreachable("Unsupported target");
- }
-
- // Before we switch over, we force a reference to a label inside the
- // xray_instr_map section. Since EmitXRayTable() is always called just
- // before the function's end, we assume that this is happening after the
- // last return instruction.
- //
- // We then align the reference to 16 byte boundaries, which we determined
- // experimentally to be beneficial to avoid causing decoder stalls.
- MCSymbol *Tmp = OutContext.createTempSymbol("xray_synthetic_", true);
- OutStreamer->EmitCodeAlignment(16);
- OutStreamer->EmitSymbolValue(Tmp, 8, false);
- OutStreamer->SwitchSection(Section);
- OutStreamer->EmitLabel(Tmp);
- for (const auto &Sled : Sleds) {
- OutStreamer->EmitSymbolValue(Sled.Sled, 8);
- OutStreamer->EmitSymbolValue(CurrentFnSym, 8);
- auto Kind = static_cast<uint8_t>(Sled.Kind);
- OutStreamer->EmitBytes(
- StringRef(reinterpret_cast<const char *>(&Kind), 1));
- OutStreamer->EmitBytes(
- StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
- OutStreamer->EmitZeros(14);
- }
- OutStreamer->SwitchSection(PrevSection);
-
- Sleds.clear();
-}
-
void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
{
static const int8_t NoopsInSledCount = 7;
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index dcb05601e5f4..8a76c42b5898 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -1470,6 +1470,9 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
bool IsUnscaled = TII->isUnscaledLdSt(MI);
int Offset = getLdStOffsetOp(MI).getImm();
int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
+ // Allow one more for offset.
+ if (Offset > 0)
+ Offset -= OffsetStride;
if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
return false;
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a87204d46eae..0b0a0e7d083e 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3048,6 +3048,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(KILL)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(SENDMSG)
+ NODE_NAME_CASE(SENDMSGHALT)
NODE_NAME_CASE(INTERP_MOV)
NODE_NAME_CASE(INTERP_P1)
NODE_NAME_CASE(INTERP_P2)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 5cc5efb331e3..745c9923de2e 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -313,6 +313,7 @@ enum NodeType : unsigned {
/// Pointer to the start of the shader's constant data.
CONST_DATA_PTR,
SENDMSG,
+ SENDMSGHALT,
INTERP_MOV,
INTERP_P1,
INTERP_P2,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index e7b40016e272..f079c8d0c70c 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -266,6 +266,10 @@ def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPInGlue]>;
+def AMDGPUsendmsghalt : SDNode<"AMDGPUISD::SENDMSGHALT",
+ SDTypeProfile<0, 1, [SDTCisInt<0>]>,
+ [SDNPHasChain, SDNPInGlue]>;
+
def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV",
SDTypeProfile<1, 3, [SDTCisFP<0>]>,
[SDNPInGlue]>;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index fa53831cbe16..c78e97dfd46f 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2706,12 +2706,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntrinsicID) {
- case AMDGPUIntrinsic::SI_sendmsg: {
+ case AMDGPUIntrinsic::SI_sendmsg:
+ case Intrinsic::amdgcn_s_sendmsg: {
Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
SDValue Glue = Chain.getValue(1);
return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
Op.getOperand(2), Glue);
}
+ case Intrinsic::amdgcn_s_sendmsghalt: {
+ Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
+ SDValue Glue = Chain.getValue(1);
+ return DAG.getNode(AMDGPUISD::SENDMSGHALT, DL, MVT::Other, Chain,
+ Op.getOperand(2), Glue);
+ }
case AMDGPUIntrinsic::SI_tbuffer_store: {
SDValue Ops[] = {
Chain,
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
index 202a1e9ed8ac..fceabd7a8fdd 100644
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -504,7 +504,7 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
return;
// There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
- if (LastInstWritesM0 && I->getOpcode() == AMDGPU::S_SENDMSG) {
+ if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
LastInstWritesM0 = false;
return;
@@ -619,7 +619,8 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
// signalling other hardware blocks
if ((I->getOpcode() == AMDGPU::S_BARRIER &&
ST->needWaitcntBeforeBarrier()) ||
- I->getOpcode() == AMDGPU::S_SENDMSG)
+ I->getOpcode() == AMDGPU::S_SENDMSG ||
+ I->getOpcode() == AMDGPU::S_SENDMSGHALT)
Required = LastIssued;
else
Required = handleOperands(*I);
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index 0aeb1297d3a7..73cd5774128e 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -828,9 +828,12 @@ let Uses = [EXEC, M0] in {
def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16",
[(AMDGPUsendmsg (i32 imm:$simm16))]
>;
+
+def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16",
+ [(AMDGPUsendmsghalt (i32 imm:$simm16))]
+>;
} // End Uses = [EXEC, M0]
-def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16">;
def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16">;
def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
let simm16 = 0;
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index f20768ab77a5..8ec9cb02813c 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -164,9 +164,6 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Emit the rest of the function body.
EmitFunctionBody();
- // Emit the XRay table for this function.
- EmitXRayTable();
-
// If we need V4T thumb mode Register Indirect Jump pads, emit them.
// These are created per function, rather than per TU, since it's
// relatively easy to exceed the thumb branch range within a TU.
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index ce0b04d56d9e..93fed10eb2d0 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -113,9 +113,6 @@ public:
void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
- // Helper function that emits the XRay sleds we've collected for a particular
- // function.
- void EmitXRayTable();
private:
void EmitSled(const MachineInstr &MI, SledKind Kind);
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index 293a527b09e8..07044b9697b6 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -22,9 +22,6 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCStreamer.h"
using namespace llvm;
@@ -226,38 +223,3 @@ void ARMAsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI)
{
EmitSled(MI, SledKind::TAIL_CALL);
}
-
-void ARMAsmPrinter::EmitXRayTable()
-{
- if (Sleds.empty())
- return;
-
- MCSection *Section = nullptr;
- if (Subtarget->isTargetELF()) {
- Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC | ELF::SHF_GROUP |
- ELF::SHF_MERGE,
- 0, CurrentFnSym->getName());
- } else if (Subtarget->isTargetMachO()) {
- Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
- SectionKind::getReadOnlyWithRel());
- } else {
- llvm_unreachable("Unsupported target");
- }
-
- auto PrevSection = OutStreamer->getCurrentSectionOnly();
- OutStreamer->SwitchSection(Section);
- for (const auto &Sled : Sleds) {
- OutStreamer->EmitSymbolValue(Sled.Sled, 4);
- OutStreamer->EmitSymbolValue(CurrentFnSym, 4);
- auto Kind = static_cast<uint8_t>(Sled.Kind);
- OutStreamer->EmitBytes(
- StringRef(reinterpret_cast<const char *>(&Kind), 1));
- OutStreamer->EmitBytes(
- StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
- OutStreamer->EmitZeros(6);
- }
- OutStreamer->SwitchSection(PrevSection);
-
- Sleds.clear();
-}
diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp
index c0591c332dea..963fb99ce09b 100644
--- a/lib/Target/Hexagon/BitTracker.cpp
+++ b/lib/Target/Hexagon/BitTracker.cpp
@@ -53,28 +53,36 @@
//
// The code below is intended to be fully target-independent.
+#include "BitTracker.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
-
-#include "BitTracker.h"
+#include <iterator>
+#include <cassert>
+#include <cstdint>
using namespace llvm;
typedef BitTracker BT;
namespace {
+
// Local trickery to pretty print a register (without the whole "%vreg"
// business).
struct printv {
printv(unsigned r) : R(r) {}
+
unsigned R;
};
+
raw_ostream &operator<< (raw_ostream &OS, const printv &PV) {
if (PV.R)
OS << 'v' << TargetRegisterInfo::virtReg2Index(PV.R);
@@ -82,9 +90,11 @@ namespace {
OS << 's';
return OS;
}
-}
+
+} // end anonymous namespace
namespace llvm {
+
raw_ostream &operator<<(raw_ostream &OS, const BT::BitValue &BV) {
switch (BV.Type) {
case BT::BitValue::Top:
@@ -167,14 +177,14 @@ namespace llvm {
return OS;
}
-}
+
+} // end namespace llvm
void BitTracker::print_cells(raw_ostream &OS) const {
for (CellMapType::iterator I = Map.begin(), E = Map.end(); I != E; ++I)
dbgs() << PrintReg(I->first, &ME.TRI) << " -> " << I->second << "\n";
}
-
BitTracker::BitTracker(const MachineEvaluator &E, MachineFunction &F)
: Trace(false), ME(E), MF(F), MRI(F.getRegInfo()), Map(*new CellMapType) {}
@@ -182,7 +192,6 @@ BitTracker::~BitTracker() {
delete &Map;
}
-
// If we were allowed to update a cell for a part of a register, the meet
// operation would need to be parametrized by the register number and the
// exact part of the register, so that the computer BitRefs correspond to
@@ -201,7 +210,6 @@ bool BT::RegisterCell::meet(const RegisterCell &RC, unsigned SelfR) {
return Changed;
}
-
// Insert the entire cell RC into the current cell at position given by M.
BT::RegisterCell &BT::RegisterCell::insert(const BT::RegisterCell &RC,
const BitMask &M) {
@@ -224,7 +232,6 @@ BT::RegisterCell &BT::RegisterCell::insert(const BT::RegisterCell &RC,
return *this;
}
-
BT::RegisterCell BT::RegisterCell::extract(const BitMask &M) const {
uint16_t B = M.first(), E = M.last(), W = width();
assert(B < W && E < W);
@@ -243,7 +250,6 @@ BT::RegisterCell BT::RegisterCell::extract(const BitMask &M) const {
return RC;
}
-
BT::RegisterCell &BT::RegisterCell::rol(uint16_t Sh) {
// Rotate left (i.e. towards increasing bit indices).
// Swap the two parts: [0..W-Sh-1] [W-Sh..W-1]
@@ -265,7 +271,6 @@ BT::RegisterCell &BT::RegisterCell::rol(uint16_t Sh) {
return *this;
}
-
BT::RegisterCell &BT::RegisterCell::fill(uint16_t B, uint16_t E,
const BitValue &V) {
assert(B <= E);
@@ -274,7 +279,6 @@ BT::RegisterCell &BT::RegisterCell::fill(uint16_t B, uint16_t E,
return *this;
}
-
BT::RegisterCell &BT::RegisterCell::cat(const RegisterCell &RC) {
// Append the cell given as the argument to the "this" cell.
// Bit 0 of RC becomes bit W of the result, where W is this->width().
@@ -285,7 +289,6 @@ BT::RegisterCell &BT::RegisterCell::cat(const RegisterCell &RC) {
return *this;
}
-
uint16_t BT::RegisterCell::ct(bool B) const {
uint16_t W = width();
uint16_t C = 0;
@@ -295,7 +298,6 @@ uint16_t BT::RegisterCell::ct(bool B) const {
return C;
}
-
uint16_t BT::RegisterCell::cl(bool B) const {
uint16_t W = width();
uint16_t C = 0;
@@ -305,7 +307,6 @@ uint16_t BT::RegisterCell::cl(bool B) const {
return C;
}
-
bool BT::RegisterCell::operator== (const RegisterCell &RC) const {
uint16_t W = Bits.size();
if (RC.Bits.size() != W)
@@ -316,7 +317,6 @@ bool BT::RegisterCell::operator== (const RegisterCell &RC) const {
return true;
}
-
uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const {
// The general problem is with finding a register class that corresponds
// to a given reference reg:sub. There can be several such classes, and
@@ -342,7 +342,6 @@ uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const {
return BW;
}
-
BT::RegisterCell BT::MachineEvaluator::getCell(const RegisterRef &RR,
const CellMapType &M) const {
uint16_t BW = getRegBitWidth(RR);
@@ -370,7 +369,6 @@ BT::RegisterCell BT::MachineEvaluator::getCell(const RegisterRef &RR,
return RegisterCell::top(BW);
}
-
void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC,
CellMapType &M) const {
// While updating the cell map can be done in a meaningful way for
@@ -388,7 +386,6 @@ void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC,
M[RR.Reg] = RC;
}
-
// Check if the cell represents a compile-time integer value.
bool BT::MachineEvaluator::isInt(const RegisterCell &A) const {
uint16_t W = A.width();
@@ -398,7 +395,6 @@ bool BT::MachineEvaluator::isInt(const RegisterCell &A) const {
return true;
}
-
// Convert a cell to the integer value. The result must fit in uint64_t.
uint64_t BT::MachineEvaluator::toInt(const RegisterCell &A) const {
assert(isInt(A));
@@ -411,7 +407,6 @@ uint64_t BT::MachineEvaluator::toInt(const RegisterCell &A) const {
return Val;
}
-
// Evaluator helper functions. These implement some common operation on
// register cells that can be used to implement target-specific instructions
// in a target-specific evaluator.
@@ -426,7 +421,6 @@ BT::RegisterCell BT::MachineEvaluator::eIMM(int64_t V, uint16_t W) const {
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eIMM(const ConstantInt *CI) const {
const APInt &A = CI->getValue();
uint16_t BW = A.getBitWidth();
@@ -437,7 +431,6 @@ BT::RegisterCell BT::MachineEvaluator::eIMM(const ConstantInt *CI) const {
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eADD(const RegisterCell &A1,
const RegisterCell &A2) const {
uint16_t W = A1.width();
@@ -471,7 +464,6 @@ BT::RegisterCell BT::MachineEvaluator::eADD(const RegisterCell &A1,
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eSUB(const RegisterCell &A1,
const RegisterCell &A2) const {
uint16_t W = A1.width();
@@ -505,29 +497,26 @@ BT::RegisterCell BT::MachineEvaluator::eSUB(const RegisterCell &A1,
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eMLS(const RegisterCell &A1,
const RegisterCell &A2) const {
uint16_t W = A1.width() + A2.width();
- uint16_t Z = A1.ct(0) + A2.ct(0);
+ uint16_t Z = A1.ct(false) + A2.ct(false);
RegisterCell Res(W);
Res.fill(0, Z, BitValue::Zero);
Res.fill(Z, W, BitValue::self());
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eMLU(const RegisterCell &A1,
const RegisterCell &A2) const {
uint16_t W = A1.width() + A2.width();
- uint16_t Z = A1.ct(0) + A2.ct(0);
+ uint16_t Z = A1.ct(false) + A2.ct(false);
RegisterCell Res(W);
Res.fill(0, Z, BitValue::Zero);
Res.fill(Z, W, BitValue::self());
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eASL(const RegisterCell &A1,
uint16_t Sh) const {
assert(Sh <= A1.width());
@@ -537,7 +526,6 @@ BT::RegisterCell BT::MachineEvaluator::eASL(const RegisterCell &A1,
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eLSR(const RegisterCell &A1,
uint16_t Sh) const {
uint16_t W = A1.width();
@@ -548,7 +536,6 @@ BT::RegisterCell BT::MachineEvaluator::eLSR(const RegisterCell &A1,
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eASR(const RegisterCell &A1,
uint16_t Sh) const {
uint16_t W = A1.width();
@@ -560,7 +547,6 @@ BT::RegisterCell BT::MachineEvaluator::eASR(const RegisterCell &A1,
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eAND(const RegisterCell &A1,
const RegisterCell &A2) const {
uint16_t W = A1.width();
@@ -583,7 +569,6 @@ BT::RegisterCell BT::MachineEvaluator::eAND(const RegisterCell &A1,
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eORL(const RegisterCell &A1,
const RegisterCell &A2) const {
uint16_t W = A1.width();
@@ -606,7 +591,6 @@ BT::RegisterCell BT::MachineEvaluator::eORL(const RegisterCell &A1,
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eXOR(const RegisterCell &A1,
const RegisterCell &A2) const {
uint16_t W = A1.width();
@@ -627,7 +611,6 @@ BT::RegisterCell BT::MachineEvaluator::eXOR(const RegisterCell &A1,
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eNOT(const RegisterCell &A1) const {
uint16_t W = A1.width();
RegisterCell Res(W);
@@ -643,7 +626,6 @@ BT::RegisterCell BT::MachineEvaluator::eNOT(const RegisterCell &A1) const {
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eSET(const RegisterCell &A1,
uint16_t BitN) const {
assert(BitN < A1.width());
@@ -652,7 +634,6 @@ BT::RegisterCell BT::MachineEvaluator::eSET(const RegisterCell &A1,
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eCLR(const RegisterCell &A1,
uint16_t BitN) const {
assert(BitN < A1.width());
@@ -661,7 +642,6 @@ BT::RegisterCell BT::MachineEvaluator::eCLR(const RegisterCell &A1,
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eCLB(const RegisterCell &A1, bool B,
uint16_t W) const {
uint16_t C = A1.cl(B), AW = A1.width();
@@ -672,7 +652,6 @@ BT::RegisterCell BT::MachineEvaluator::eCLB(const RegisterCell &A1, bool B,
return RegisterCell::self(0, W);
}
-
BT::RegisterCell BT::MachineEvaluator::eCTB(const RegisterCell &A1, bool B,
uint16_t W) const {
uint16_t C = A1.ct(B), AW = A1.width();
@@ -683,7 +662,6 @@ BT::RegisterCell BT::MachineEvaluator::eCTB(const RegisterCell &A1, bool B,
return RegisterCell::self(0, W);
}
-
BT::RegisterCell BT::MachineEvaluator::eSXT(const RegisterCell &A1,
uint16_t FromN) const {
uint16_t W = A1.width();
@@ -695,7 +673,6 @@ BT::RegisterCell BT::MachineEvaluator::eSXT(const RegisterCell &A1,
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eZXT(const RegisterCell &A1,
uint16_t FromN) const {
uint16_t W = A1.width();
@@ -705,7 +682,6 @@ BT::RegisterCell BT::MachineEvaluator::eZXT(const RegisterCell &A1,
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eXTR(const RegisterCell &A1,
uint16_t B, uint16_t E) const {
uint16_t W = A1.width();
@@ -718,7 +694,6 @@ BT::RegisterCell BT::MachineEvaluator::eXTR(const RegisterCell &A1,
return Res;
}
-
BT::RegisterCell BT::MachineEvaluator::eINS(const RegisterCell &A1,
const RegisterCell &A2, uint16_t AtN) const {
uint16_t W1 = A1.width(), W2 = A2.width();
@@ -731,7 +706,6 @@ BT::RegisterCell BT::MachineEvaluator::eINS(const RegisterCell &A1,
return Res;
}
-
BT::BitMask BT::MachineEvaluator::mask(unsigned Reg, unsigned Sub) const {
assert(Sub == 0 && "Generic BitTracker::mask called for Sub != 0");
uint16_t W = getRegBitWidth(Reg);
@@ -785,7 +759,6 @@ bool BT::MachineEvaluator::evaluate(const MachineInstr &MI,
return true;
}
-
// Main W-Z implementation.
void BT::visitPHI(const MachineInstr &PI) {
@@ -977,7 +950,6 @@ void BT::visitBranchesFrom(const MachineInstr &BI) {
}
}
-
void BT::visitUsesOf(unsigned Reg) {
if (Trace)
dbgs() << "visiting uses of " << PrintReg(Reg, &ME.TRI) << "\n";
@@ -997,17 +969,14 @@ void BT::visitUsesOf(unsigned Reg) {
}
}
-
BT::RegisterCell BT::get(RegisterRef RR) const {
return ME.getCell(RR, Map);
}
-
void BT::put(RegisterRef RR, const RegisterCell &RC) {
ME.putCell(RR, RC, Map);
}
-
// Replace all references to bits from OldRR with the corresponding bits
// in NewRR.
void BT::subst(RegisterRef OldRR, RegisterRef NewRR) {
@@ -1033,7 +1002,6 @@ void BT::subst(RegisterRef OldRR, RegisterRef NewRR) {
}
}
-
// Check if the block has been "executed" during propagation. (If not, the
// block is dead, but it may still appear to be reachable.)
bool BT::reached(const MachineBasicBlock *B) const {
@@ -1047,7 +1015,6 @@ bool BT::reached(const MachineBasicBlock *B) const {
return false;
}
-
// Visit an individual instruction. This could be a newly added instruction,
// or one that has been modified by an optimization.
void BT::visit(const MachineInstr &MI) {
@@ -1061,14 +1028,12 @@ void BT::visit(const MachineInstr &MI) {
FlowQ.pop();
}
-
void BT::reset() {
EdgeExec.clear();
InstrExec.clear();
Map.clear();
}
-
void BT::run() {
reset();
assert(FlowQ.empty());
@@ -1141,4 +1106,3 @@ void BT::run() {
if (Trace)
print_cells(dbgs() << "Cells after propagation:\n");
}
-
diff --git a/lib/Target/Hexagon/BitTracker.h b/lib/Target/Hexagon/BitTracker.h
index 74cafcd00b60..48c5f2266acf 100644
--- a/lib/Target/Hexagon/BitTracker.h
+++ b/lib/Target/Hexagon/BitTracker.h
@@ -1,4 +1,4 @@
-//===--- BitTracker.h -----------------------------------------------------===//
+//===--- BitTracker.h -------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,24 +7,27 @@
//
//===----------------------------------------------------------------------===//
-#ifndef BITTRACKER_H
-#define BITTRACKER_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_BITTRACKER_H
+#define LLVM_LIB_TARGET_HEXAGON_BITTRACKER_H
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
-
+#include "llvm/CodeGen/MachineOperand.h"
+#include <cassert>
+#include <cstdint>
#include <map>
#include <queue>
#include <set>
+#include <utility>
namespace llvm {
- class ConstantInt;
- class MachineRegisterInfo;
- class MachineBasicBlock;
- class MachineInstr;
- class MachineOperand;
- class raw_ostream;
+
+class ConstantInt;
+class MachineRegisterInfo;
+class MachineBasicBlock;
+class MachineInstr;
+class raw_ostream;
struct BitTracker {
struct BitRef;
@@ -76,19 +79,19 @@ private:
CellMapType &Map;
};
-
// Abstraction of a reference to bit at position Pos from a register Reg.
struct BitTracker::BitRef {
BitRef(unsigned R = 0, uint16_t P = 0) : Reg(R), Pos(P) {}
+
bool operator== (const BitRef &BR) const {
// If Reg is 0, disregard Pos.
return Reg == BR.Reg && (Reg == 0 || Pos == BR.Pos);
}
+
unsigned Reg;
uint16_t Pos;
};
-
// Abstraction of a register reference in MachineOperand. It contains the
// register number and the subregister index.
struct BitTracker::RegisterRef {
@@ -96,10 +99,10 @@ struct BitTracker::RegisterRef {
: Reg(R), Sub(S) {}
RegisterRef(const MachineOperand &MO)
: Reg(MO.getReg()), Sub(MO.getSubReg()) {}
+
unsigned Reg, Sub;
};
-
// Value that a single bit can take. This is outside of the context of
// any register, it is more of an abstraction of the two-element set of
// possible bit values. One extension here is the "Ref" type, which
@@ -158,6 +161,7 @@ struct BitTracker::BitValue {
bool operator!= (const BitValue &V) const {
return !operator==(V);
}
+
bool is(unsigned T) const {
assert(T == 0 || T == 1);
return T == 0 ? Type == Zero
@@ -209,6 +213,7 @@ struct BitTracker::BitValue {
bool num() const {
return Type == Zero || Type == One;
}
+
operator bool() const {
assert(Type == Zero || Type == One);
return Type == One;
@@ -217,7 +222,6 @@ struct BitTracker::BitValue {
friend raw_ostream &operator<<(raw_ostream &OS, const BitValue &BV);
};
-
// This operation must be idempotent, i.e. ref(ref(V)) == ref(V).
inline BitTracker::BitValue
BitTracker::BitValue::ref(const BitValue &V) {
@@ -228,26 +232,26 @@ BitTracker::BitValue::ref(const BitValue &V) {
return self();
}
-
inline BitTracker::BitValue
BitTracker::BitValue::self(const BitRef &Self) {
return BitValue(Self.Reg, Self.Pos);
}
-
// A sequence of bits starting from index B up to and including index E.
// If E < B, the mask represents two sections: [0..E] and [B..W) where
// W is the width of the register.
struct BitTracker::BitMask {
- BitMask() : B(0), E(0) {}
+ BitMask() = default;
BitMask(uint16_t b, uint16_t e) : B(b), E(e) {}
+
uint16_t first() const { return B; }
uint16_t last() const { return E; }
+
private:
- uint16_t B, E;
+ uint16_t B = 0;
+ uint16_t E = 0;
};
-
// Representation of a register: a list of BitValues.
struct BitTracker::RegisterCell {
RegisterCell(uint16_t Width = DefaultBitN) : Bits(Width) {}
@@ -255,6 +259,7 @@ struct BitTracker::RegisterCell {
uint16_t width() const {
return Bits.size();
}
+
const BitValue &operator[](uint16_t BitN) const {
assert(BitN < Bits.size());
return Bits[BitN];
@@ -297,12 +302,10 @@ private:
friend raw_ostream &operator<<(raw_ostream &OS, const RegisterCell &RC);
};
-
inline bool BitTracker::has(unsigned Reg) const {
return Map.find(Reg) != Map.end();
}
-
inline const BitTracker::RegisterCell&
BitTracker::lookup(unsigned Reg) const {
CellMapType::const_iterator F = Map.find(Reg);
@@ -310,7 +313,6 @@ BitTracker::lookup(unsigned Reg) const {
return F->second;
}
-
inline BitTracker::RegisterCell
BitTracker::RegisterCell::self(unsigned Reg, uint16_t Width) {
RegisterCell RC(Width);
@@ -319,7 +321,6 @@ BitTracker::RegisterCell::self(unsigned Reg, uint16_t Width) {
return RC;
}
-
inline BitTracker::RegisterCell
BitTracker::RegisterCell::top(uint16_t Width) {
RegisterCell RC(Width);
@@ -328,7 +329,6 @@ BitTracker::RegisterCell::top(uint16_t Width) {
return RC;
}
-
inline BitTracker::RegisterCell
BitTracker::RegisterCell::ref(const RegisterCell &C) {
uint16_t W = C.width();
@@ -345,12 +345,13 @@ BitTracker::RegisterCell::ref(const RegisterCell &C) {
struct BitTracker::MachineEvaluator {
MachineEvaluator(const TargetRegisterInfo &T, MachineRegisterInfo &M)
: TRI(T), MRI(M) {}
- virtual ~MachineEvaluator() {}
+ virtual ~MachineEvaluator() = default;
uint16_t getRegBitWidth(const RegisterRef &RR) const;
RegisterCell getCell(const RegisterRef &RR, const CellMapType &M) const;
void putCell(const RegisterRef &RR, RegisterCell RC, CellMapType &M) const;
+
// A result of any operation should use refs to the source cells, not
// the cells directly. This function is a convenience wrapper to quickly
// generate a ref for a cell corresponding to a register reference.
@@ -435,4 +436,4 @@ struct BitTracker::MachineEvaluator {
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_HEXAGON_BITTRACKER_H
diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp
index b78c4126e0b1..436f88dcd450 100644
--- a/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -7,16 +7,30 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
#include "Hexagon.h"
+#include "HexagonBitTracker.h"
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonTargetMachine.h"
-#include "HexagonBitTracker.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <utility>
+#include <vector>
using namespace llvm;
@@ -76,11 +90,11 @@ HexagonEvaluator::HexagonEvaluator(const HexagonRegisterInfo &tri,
}
}
-
BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const {
+ using namespace Hexagon;
+
if (Sub == 0)
return MachineEvaluator::mask(Reg, 0);
- using namespace Hexagon;
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
unsigned ID = RC->getID();
uint16_t RW = getRegBitWidth(RegisterRef(Reg, Sub));
@@ -102,6 +116,7 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const {
}
namespace {
+
class RegisterRefs {
std::vector<BT::RegisterRef> Vector;
@@ -117,17 +132,21 @@ public:
}
size_t size() const { return Vector.size(); }
+
const BT::RegisterRef &operator[](unsigned n) const {
// The main purpose of this operator is to assert with bad argument.
assert(n < Vector.size());
return Vector[n];
}
};
-}
+
+} // end anonymous namespace
bool HexagonEvaluator::evaluate(const MachineInstr &MI,
const CellMapType &Inputs,
CellMapType &Outputs) const {
+ using namespace Hexagon;
+
unsigned NumDefs = 0;
// Sanity verification: there should not be any defs with subregisters.
@@ -142,7 +161,6 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
if (NumDefs == 0)
return false;
- using namespace Hexagon;
unsigned Opc = MI.getOpcode();
if (MI.mayLoad()) {
@@ -779,10 +797,10 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
case S2_cl0:
case S2_cl0p:
// Always produce a 32-bit result.
- return rr0(eCLB(rc(1), 0/*bit*/, 32), Outputs);
+ return rr0(eCLB(rc(1), false/*bit*/, 32), Outputs);
case S2_cl1:
case S2_cl1p:
- return rr0(eCLB(rc(1), 1/*bit*/, 32), Outputs);
+ return rr0(eCLB(rc(1), true/*bit*/, 32), Outputs);
case S2_clb:
case S2_clbp: {
uint16_t W1 = getRegBitWidth(Reg[1]);
@@ -794,10 +812,10 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
}
case S2_ct0:
case S2_ct0p:
- return rr0(eCTB(rc(1), 0/*bit*/, 32), Outputs);
+ return rr0(eCTB(rc(1), false/*bit*/, 32), Outputs);
case S2_ct1:
case S2_ct1p:
- return rr0(eCTB(rc(1), 1/*bit*/, 32), Outputs);
+ return rr0(eCTB(rc(1), true/*bit*/, 32), Outputs);
case S5_popcountp:
// TODO
break;
@@ -953,6 +971,8 @@ bool HexagonEvaluator::evaluate(const MachineInstr &BI,
bool HexagonEvaluator::evaluateLoad(const MachineInstr &MI,
const CellMapType &Inputs,
CellMapType &Outputs) const {
+ using namespace Hexagon;
+
if (TII.isPredicated(MI))
return false;
assert(MI.mayLoad() && "A load that mayn't?");
@@ -960,7 +980,6 @@ bool HexagonEvaluator::evaluateLoad(const MachineInstr &MI,
uint16_t BitNum;
bool SignEx;
- using namespace Hexagon;
switch (Opc) {
default:
@@ -1141,9 +1160,9 @@ bool HexagonEvaluator::evaluateFormalCopy(const MachineInstr &MI,
return true;
}
-
unsigned HexagonEvaluator::getNextPhysReg(unsigned PReg, unsigned Width) const {
using namespace Hexagon;
+
bool Is64 = DoubleRegsRegClass.contains(PReg);
assert(PReg == 0 || Is64 || IntRegsRegClass.contains(PReg));
@@ -1180,7 +1199,6 @@ unsigned HexagonEvaluator::getNextPhysReg(unsigned PReg, unsigned Width) const {
return (Idx64+1 < Num64) ? Phys64[Idx64+1] : 0;
}
-
unsigned HexagonEvaluator::getVirtRegFor(unsigned PReg) const {
typedef MachineRegisterInfo::livein_iterator iterator;
for (iterator I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) {
diff --git a/lib/Target/Hexagon/HexagonBitTracker.h b/lib/Target/Hexagon/HexagonBitTracker.h
index 9e7b1dbe298f..2cbf65e66ca6 100644
--- a/lib/Target/Hexagon/HexagonBitTracker.h
+++ b/lib/Target/Hexagon/HexagonBitTracker.h
@@ -1,4 +1,4 @@
-//===--- HexagonBitTracker.h ----------------------------------------------===//
+//===--- HexagonBitTracker.h ------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,15 +7,17 @@
//
//===----------------------------------------------------------------------===//
-#ifndef HEXAGONBITTRACKER_H
-#define HEXAGONBITTRACKER_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONBITTRACKER_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONBITTRACKER_H
#include "BitTracker.h"
#include "llvm/ADT/DenseMap.h"
+#include <cstdint>
namespace llvm {
- class HexagonInstrInfo;
- class HexagonRegisterInfo;
+
+class HexagonInstrInfo;
+class HexagonRegisterInfo;
struct HexagonEvaluator : public BitTracker::MachineEvaluator {
typedef BitTracker::CellMapType CellMapType;
@@ -49,10 +51,12 @@ private:
// Type of formal parameter extension.
struct ExtType {
enum { SExt, ZExt };
- char Type;
- uint16_t Width;
- ExtType() : Type(0), Width(0) {}
+
+ ExtType() = default;
ExtType(char t, uint16_t w) : Type(t), Width(w) {}
+
+ char Type = 0;
+ uint16_t Width = 0;
};
// Map VR -> extension type.
typedef DenseMap<unsigned, ExtType> RegExtMap;
@@ -61,4 +65,4 @@ private:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONBITTRACKER_H
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 34ce3e652995..0a7dc6b49d00 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -11,26 +11,45 @@
//
//===----------------------------------------------------------------------===//
+#include "Hexagon.h"
#include "HexagonHazardRecognizer.h"
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
#include <cctype>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
using namespace llvm;
@@ -108,19 +127,16 @@ HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
: HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP),
RI() {}
-
static bool isIntRegForSubInst(unsigned Reg) {
return (Reg >= Hexagon::R0 && Reg <= Hexagon::R7) ||
(Reg >= Hexagon::R16 && Reg <= Hexagon::R23);
}
-
static bool isDblRegForSubInst(unsigned Reg, const HexagonRegisterInfo &HRI) {
return isIntRegForSubInst(HRI.getSubReg(Reg, Hexagon::isub_lo)) &&
isIntRegForSubInst(HRI.getSubReg(Reg, Hexagon::isub_hi));
}
-
/// Calculate number of instructions excluding the debug instructions.
static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
MachineBasicBlock::const_instr_iterator MIE) {
@@ -132,7 +148,6 @@ static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
return Count;
}
-
/// Find the hardware loop instruction used to set-up the specified loop.
/// On Hexagon, we have two instructions used to set-up the hardware loop
/// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions
@@ -164,17 +179,16 @@ static MachineInstr *findLoopInstr(MachineBasicBlock *BB, int EndLoopOp,
return &*I;
// We've reached a different loop, which means the loop0 has been removed.
if (Opc == EndLoopOp)
- return 0;
+ return nullptr;
}
// Check the predecessors for the LOOP instruction.
MachineInstr *loop = findLoopInstr(*PB, EndLoopOp, Visited);
if (loop)
return loop;
}
- return 0;
+ return nullptr;
}
-
/// Gather register def/uses from MI.
/// This treats possible (predicated) defs as actually happening ones
/// (conservatively).
@@ -201,7 +215,6 @@ static inline void parseOperands(const MachineInstr &MI,
}
}
-
// Position dependent, so check twice for swap.
static bool isDuplexPairMatch(unsigned Ga, unsigned Gb) {
switch (Ga) {
@@ -228,8 +241,6 @@ static bool isDuplexPairMatch(unsigned Ga, unsigned Gb) {
return false;
}
-
-
/// isLoadFromStackSlot - If the specified machine instruction is a direct
/// load from a stack slot, return the virtual or physical register number of
/// the destination along with the FrameIndex of the loaded stack slot. If
@@ -280,7 +291,6 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
return 0;
}
-
/// isStoreToStackSlot - If the specified machine instruction is a direct
/// store to a stack slot, return the virtual or physical register number of
/// the source reg along with the FrameIndex of the loaded stack slot. If
@@ -337,7 +347,6 @@ unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
return 0;
}
-
/// This function can analyze one/two way branching only and should (mostly) be
/// called by target independent side.
/// First entry is always the opcode of the branching instruction, except when
@@ -401,7 +410,7 @@ bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
// Delete the J2_jump if it's equivalent to a fall-through.
if (AllowModify && JumpToBlock &&
MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
- DEBUG(dbgs()<< "\nErasing the jump to successor block\n";);
+ DEBUG(dbgs() << "\nErasing the jump to successor block\n";);
I->eraseFromParent();
I = MBB.instr_end();
if (I == MBB.instr_begin())
@@ -415,7 +424,7 @@ bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineInstr *LastInst = &*I;
MachineInstr *SecondLastInst = nullptr;
// Find one more terminator if present.
- for (;;) {
+ while (true) {
if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(*I)) {
if (!SecondLastInst)
SecondLastInst = &*I;
@@ -524,7 +533,6 @@ bool HexagonInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
return true;
}
-
unsigned HexagonInstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
assert(!BytesRemoved && "code size not handled");
@@ -730,7 +738,6 @@ bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
return nonDbgBBSize(&MBB) <= 3;
}
-
bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB,
unsigned NumFCycles, unsigned ExtraFCycles, BranchProbability Probability)
@@ -738,7 +745,6 @@ bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
return nonDbgBBSize(&TMBB) <= 3 && nonDbgBBSize(&FMBB) <= 3;
}
-
bool HexagonInstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
unsigned NumInstrs, BranchProbability Probability) const {
return NumInstrs <= 4;
@@ -853,7 +859,6 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
llvm_unreachable("Unimplemented");
}
-
void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const {
@@ -976,7 +981,6 @@ void HexagonInstrInfo::loadRegFromStackSlot(
}
}
-
static void getLiveRegsAt(LivePhysRegs &Regs, const MachineInstr &MI) {
const MachineBasicBlock &B = *MI.getParent();
Regs.addLiveOuts(B);
@@ -1307,7 +1311,6 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return false;
}
-
// We indicate that we want to reverse the branch by
// inserting the reversed branching opcode.
bool HexagonInstrInfo::reverseBranchCondition(
@@ -1325,19 +1328,16 @@ bool HexagonInstrInfo::reverseBranchCondition(
return false;
}
-
void HexagonInstrInfo::insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const {
DebugLoc DL;
BuildMI(MBB, MI, DL, get(Hexagon::A2_nop));
}
-
bool HexagonInstrInfo::isPostIncrement(const MachineInstr &MI) const {
return getAddrMode(MI) == HexagonII::PostInc;
}
-
// Returns true if an instruction is predicated irrespective of the predicate
// sense. For example, all of the following will return true.
// if (p0) R1 = add(R2, R3)
@@ -1351,7 +1351,6 @@ bool HexagonInstrInfo::isPredicated(const MachineInstr &MI) const {
return (F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask;
}
-
bool HexagonInstrInfo::PredicateInstruction(
MachineInstr &MI, ArrayRef<MachineOperand> Cond) const {
if (Cond.empty() || isNewValueJump(Cond[0].getImm()) ||
@@ -1403,14 +1402,12 @@ bool HexagonInstrInfo::PredicateInstruction(
return true;
}
-
bool HexagonInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
ArrayRef<MachineOperand> Pred2) const {
// TODO: Fix this
return false;
}
-
bool HexagonInstrInfo::DefinesPredicate(
MachineInstr &MI, std::vector<MachineOperand> &Pred) const {
auto &HRI = getRegisterInfo();
@@ -1427,7 +1424,6 @@ bool HexagonInstrInfo::DefinesPredicate(
return false;
}
-
bool HexagonInstrInfo::isPredicable(MachineInstr &MI) const {
return MI.getDesc().isPredicable();
}
@@ -1466,7 +1462,6 @@ bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
return false;
}
-
/// Measure the specified inline asm to determine an approximation of its
/// length.
/// Comments (which run till the next SeparatorString or newline) do not
@@ -1502,7 +1497,6 @@ unsigned HexagonInstrInfo::getInlineAsmLength(const char *Str,
return Length;
}
-
ScheduleHazardRecognizer*
HexagonInstrInfo::CreateTargetPostRAHazardRecognizer(
const InstrItineraryData *II, const ScheduleDAG *DAG) const {
@@ -1513,7 +1507,6 @@ HexagonInstrInfo::CreateTargetPostRAHazardRecognizer(
return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
}
-
/// \brief For a comparison instruction, return the source registers in
/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
@@ -1609,14 +1602,12 @@ unsigned HexagonInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
return getInstrTimingClassLatency(ItinData, MI);
}
-
DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState(
const TargetSubtargetInfo &STI) const {
const InstrItineraryData *II = STI.getInstrItineraryData();
return static_cast<const HexagonSubtarget&>(STI).createDFAPacketizer(II);
}
-
// Inspired by this pair:
// %R13<def> = L2_loadri_io %R29, 136; mem:LD4[FixedStack0]
// S2_storeri_io %R29, 132, %R1<kill>; flags: mem:ST4[FixedStack1]
@@ -1661,7 +1652,6 @@ bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(
return false;
}
-
/// If the instruction is an increment of a constant value, return the amount.
bool HexagonInstrInfo::getIncrementValue(const MachineInstr &MI,
int &Value) const {
@@ -1677,7 +1667,6 @@ bool HexagonInstrInfo::getIncrementValue(const MachineInstr &MI,
return false;
}
-
unsigned HexagonInstrInfo::createVR(MachineFunction *MF, MVT VT) const {
MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *TRC;
@@ -1695,18 +1684,15 @@ unsigned HexagonInstrInfo::createVR(MachineFunction *MF, MVT VT) const {
return NewReg;
}
-
bool HexagonInstrInfo::isAbsoluteSet(const MachineInstr &MI) const {
return (getAddrMode(MI) == HexagonII::AbsoluteSet);
}
-
bool HexagonInstrInfo::isAccumulator(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
return((F >> HexagonII::AccumulatorPos) & HexagonII::AccumulatorMask);
}
-
bool HexagonInstrInfo::isComplex(const MachineInstr &MI) const {
const MachineFunction *MF = MI.getParent()->getParent();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
@@ -1727,13 +1713,11 @@ bool HexagonInstrInfo::isComplex(const MachineInstr &MI) const {
return false;
}
-
// Return true if the instruction is a compund branch instruction.
bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr &MI) const {
return (getType(MI) == HexagonII::TypeCOMPOUND && MI.isBranch());
}
-
bool HexagonInstrInfo::isCondInst(const MachineInstr &MI) const {
return (MI.isBranch() && isPredicated(MI)) ||
isConditionalTransfer(MI) ||
@@ -1744,7 +1728,6 @@ bool HexagonInstrInfo::isCondInst(const MachineInstr &MI) const {
!isPredicatedNew(MI));
}
-
bool HexagonInstrInfo::isConditionalALU32(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case Hexagon::A2_paddf:
@@ -1802,7 +1785,6 @@ bool HexagonInstrInfo::isConditionalALU32(const MachineInstr &MI) const {
return false;
}
-
// FIXME - Function name and it's functionality don't match.
// It should be renamed to hasPredNewOpcode()
bool HexagonInstrInfo::isConditionalLoad(const MachineInstr &MI) const {
@@ -1814,7 +1796,6 @@ bool HexagonInstrInfo::isConditionalLoad(const MachineInstr &MI) const {
return PNewOpcode >= 0;
}
-
// Returns true if an instruction is a conditional store.
//
// Note: It doesn't include conditional new-value stores as they can't be
@@ -1872,7 +1853,6 @@ bool HexagonInstrInfo::isConditionalStore(const MachineInstr &MI) const {
}
}
-
bool HexagonInstrInfo::isConditionalTransfer(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case Hexagon::A2_tfrt:
@@ -1893,7 +1873,6 @@ bool HexagonInstrInfo::isConditionalTransfer(const MachineInstr &MI) const {
return false;
}
-
// TODO: In order to have isExtendable for fpimm/f32Ext, we need to handle
// isFPImm and later getFPImm as well.
bool HexagonInstrInfo::isConstExtended(const MachineInstr &MI) const {
@@ -1942,7 +1921,6 @@ bool HexagonInstrInfo::isConstExtended(const MachineInstr &MI) const {
return (ImmValue < MinValue || ImmValue > MaxValue);
}
-
bool HexagonInstrInfo::isDeallocRet(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case Hexagon::L4_return :
@@ -1957,7 +1935,6 @@ bool HexagonInstrInfo::isDeallocRet(const MachineInstr &MI) const {
return false;
}
-
// Return true when ConsMI uses a register defined by ProdMI.
bool HexagonInstrInfo::isDependent(const MachineInstr &ProdMI,
const MachineInstr &ConsMI) const {
@@ -1994,7 +1971,6 @@ bool HexagonInstrInfo::isDependent(const MachineInstr &ProdMI,
return false;
}
-
// Returns true if the instruction is alread a .cur.
bool HexagonInstrInfo::isDotCurInst(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
@@ -2007,7 +1983,6 @@ bool HexagonInstrInfo::isDotCurInst(const MachineInstr &MI) const {
return false;
}
-
// Returns true, if any one of the operands is a dot new
// insn, whether it is predicated dot new or register dot new.
bool HexagonInstrInfo::isDotNewInst(const MachineInstr &MI) const {
@@ -2017,7 +1992,6 @@ bool HexagonInstrInfo::isDotNewInst(const MachineInstr &MI) const {
return false;
}
-
/// Symmetrical. See if these two instructions are fit for duplex pair.
bool HexagonInstrInfo::isDuplexPair(const MachineInstr &MIa,
const MachineInstr &MIb) const {
@@ -2026,7 +2000,6 @@ bool HexagonInstrInfo::isDuplexPair(const MachineInstr &MIa,
return (isDuplexPairMatch(MIaG, MIbG) || isDuplexPairMatch(MIbG, MIaG));
}
-
bool HexagonInstrInfo::isEarlySourceInstr(const MachineInstr &MI) const {
if (MI.mayLoad() || MI.mayStore() || MI.isCompare())
return true;
@@ -2038,13 +2011,11 @@ bool HexagonInstrInfo::isEarlySourceInstr(const MachineInstr &MI) const {
return false;
}
-
bool HexagonInstrInfo::isEndLoopN(unsigned Opcode) const {
return (Opcode == Hexagon::ENDLOOP0 ||
Opcode == Hexagon::ENDLOOP1);
}
-
bool HexagonInstrInfo::isExpr(unsigned OpType) const {
switch(OpType) {
case MachineOperand::MO_MachineBasicBlock:
@@ -2059,7 +2030,6 @@ bool HexagonInstrInfo::isExpr(unsigned OpType) const {
}
}
-
bool HexagonInstrInfo::isExtendable(const MachineInstr &MI) const {
const MCInstrDesc &MID = MI.getDesc();
const uint64_t F = MID.TSFlags;
@@ -2079,7 +2049,6 @@ bool HexagonInstrInfo::isExtendable(const MachineInstr &MI) const {
return false;
}
-
// This returns true in two cases:
// - The OP code itself indicates that this is an extended instruction.
// - One of MOs has been marked with HMOTF_ConstExtended flag.
@@ -2098,14 +2067,12 @@ bool HexagonInstrInfo::isExtended(const MachineInstr &MI) const {
return false;
}
-
bool HexagonInstrInfo::isFloat(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
const uint64_t F = get(Opcode).TSFlags;
return (F >> HexagonII::FPPos) & HexagonII::FPMask;
}
-
// No V60 HVX VMEM with A_INDIRECT.
bool HexagonInstrInfo::isHVXMemWithAIndirect(const MachineInstr &I,
const MachineInstr &J) const {
@@ -2116,7 +2083,6 @@ bool HexagonInstrInfo::isHVXMemWithAIndirect(const MachineInstr &I,
return J.isIndirectBranch() || isIndirectCall(J) || isIndirectL4Return(J);
}
-
bool HexagonInstrInfo::isIndirectCall(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case Hexagon::J2_callr :
@@ -2128,7 +2094,6 @@ bool HexagonInstrInfo::isIndirectCall(const MachineInstr &MI) const {
return false;
}
-
bool HexagonInstrInfo::isIndirectL4Return(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case Hexagon::L4_return :
@@ -2143,7 +2108,6 @@ bool HexagonInstrInfo::isIndirectL4Return(const MachineInstr &MI) const {
return false;
}
-
bool HexagonInstrInfo::isJumpR(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case Hexagon::J2_jumpr :
@@ -2158,7 +2122,6 @@ bool HexagonInstrInfo::isJumpR(const MachineInstr &MI) const {
return false;
}
-
// Return true if a given MI can accommodate given offset.
// Use abs estimate as oppose to the exact number.
// TODO: This will need to be changed to use MC level
@@ -2203,7 +2166,6 @@ bool HexagonInstrInfo::isJumpWithinBranchRange(const MachineInstr &MI,
}
}
-
bool HexagonInstrInfo::isLateInstrFeedsEarlyInstr(const MachineInstr &LRMI,
const MachineInstr &ESMI) const {
bool isLate = isLateResultInstr(LRMI);
@@ -2222,7 +2184,6 @@ bool HexagonInstrInfo::isLateInstrFeedsEarlyInstr(const MachineInstr &LRMI,
return false;
}
-
bool HexagonInstrInfo::isLateResultInstr(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case TargetOpcode::EXTRACT_SUBREG:
@@ -2259,14 +2220,12 @@ bool HexagonInstrInfo::isLateResultInstr(const MachineInstr &MI) const {
return true;
}
-
bool HexagonInstrInfo::isLateSourceInstr(const MachineInstr &MI) const {
// Instructions with iclass A_CVI_VX and attribute A_CVI_LATE uses a multiply
// resource, but all operands can be received late like an ALU instruction.
return MI.getDesc().getSchedClass() == Hexagon::Sched::CVI_VX_LATE;
}
-
bool HexagonInstrInfo::isLoopN(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
return Opcode == Hexagon::J2_loop0i ||
@@ -2279,7 +2238,6 @@ bool HexagonInstrInfo::isLoopN(const MachineInstr &MI) const {
Opcode == Hexagon::J2_loop1rext;
}
-
bool HexagonInstrInfo::isMemOp(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default: return false;
@@ -2312,46 +2270,38 @@ bool HexagonInstrInfo::isMemOp(const MachineInstr &MI) const {
return false;
}
-
bool HexagonInstrInfo::isNewValue(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
return (F >> HexagonII::NewValuePos) & HexagonII::NewValueMask;
}
-
bool HexagonInstrInfo::isNewValue(unsigned Opcode) const {
const uint64_t F = get(Opcode).TSFlags;
return (F >> HexagonII::NewValuePos) & HexagonII::NewValueMask;
}
-
bool HexagonInstrInfo::isNewValueInst(const MachineInstr &MI) const {
return isNewValueJump(MI) || isNewValueStore(MI);
}
-
bool HexagonInstrInfo::isNewValueJump(const MachineInstr &MI) const {
return isNewValue(MI) && MI.isBranch();
}
-
bool HexagonInstrInfo::isNewValueJump(unsigned Opcode) const {
return isNewValue(Opcode) && get(Opcode).isBranch() && isPredicated(Opcode);
}
-
bool HexagonInstrInfo::isNewValueStore(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
return (F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask;
}
-
bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const {
const uint64_t F = get(Opcode).TSFlags;
return (F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask;
}
-
// Returns true if a particular operand is extendable for an instruction.
bool HexagonInstrInfo::isOperandExtended(const MachineInstr &MI,
unsigned OperandNum) const {
@@ -2360,28 +2310,24 @@ bool HexagonInstrInfo::isOperandExtended(const MachineInstr &MI,
== OperandNum;
}
-
bool HexagonInstrInfo::isPredicatedNew(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
assert(isPredicated(MI));
return (F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask;
}
-
bool HexagonInstrInfo::isPredicatedNew(unsigned Opcode) const {
const uint64_t F = get(Opcode).TSFlags;
assert(isPredicated(Opcode));
return (F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask;
}
-
bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
return !((F >> HexagonII::PredicatedFalsePos) &
HexagonII::PredicatedFalseMask);
}
-
bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const {
const uint64_t F = get(Opcode).TSFlags;
// Make sure that the instruction is predicated.
@@ -2390,19 +2336,16 @@ bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const {
HexagonII::PredicatedFalseMask);
}
-
bool HexagonInstrInfo::isPredicated(unsigned Opcode) const {
const uint64_t F = get(Opcode).TSFlags;
return (F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask;
}
-
bool HexagonInstrInfo::isPredicateLate(unsigned Opcode) const {
const uint64_t F = get(Opcode).TSFlags;
return ~(F >> HexagonII::PredicateLatePos) & HexagonII::PredicateLateMask;
}
-
bool HexagonInstrInfo::isPredictedTaken(unsigned Opcode) const {
const uint64_t F = get(Opcode).TSFlags;
assert(get(Opcode).isBranch() &&
@@ -2410,7 +2353,6 @@ bool HexagonInstrInfo::isPredictedTaken(unsigned Opcode) const {
return (F >> HexagonII::TakenPos) & HexagonII::TakenMask;
}
-
bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr &MI) const {
return MI.getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4 ||
MI.getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4_EXT ||
@@ -2496,13 +2438,11 @@ bool HexagonInstrInfo::isSignExtendingLoad(const MachineInstr &MI) const {
}
}
-
bool HexagonInstrInfo::isSolo(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
return (F >> HexagonII::SoloPos) & HexagonII::SoloMask;
}
-
bool HexagonInstrInfo::isSpillPredRegOp(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case Hexagon::STriw_pred :
@@ -2513,7 +2453,6 @@ bool HexagonInstrInfo::isSpillPredRegOp(const MachineInstr &MI) const {
}
}
-
bool HexagonInstrInfo::isTailCall(const MachineInstr &MI) const {
if (!MI.isBranch())
return false;
@@ -2524,7 +2463,6 @@ bool HexagonInstrInfo::isTailCall(const MachineInstr &MI) const {
return false;
}
-
// Returns true when SU has a timing class TC1.
bool HexagonInstrInfo::isTC1(const MachineInstr &MI) const {
unsigned SchedClass = MI.getDesc().getSchedClass();
@@ -2544,7 +2482,6 @@ bool HexagonInstrInfo::isTC1(const MachineInstr &MI) const {
}
}
-
bool HexagonInstrInfo::isTC2(const MachineInstr &MI) const {
unsigned SchedClass = MI.getDesc().getSchedClass();
switch (SchedClass) {
@@ -2561,7 +2498,6 @@ bool HexagonInstrInfo::isTC2(const MachineInstr &MI) const {
}
}
-
bool HexagonInstrInfo::isTC2Early(const MachineInstr &MI) const {
unsigned SchedClass = MI.getDesc().getSchedClass();
switch (SchedClass) {
@@ -2582,13 +2518,11 @@ bool HexagonInstrInfo::isTC2Early(const MachineInstr &MI) const {
}
}
-
bool HexagonInstrInfo::isTC4x(const MachineInstr &MI) const {
unsigned SchedClass = MI.getDesc().getSchedClass();
return SchedClass == Hexagon::Sched::M_tc_3or4x_SLOT23;
}
-
// Schedule this ASAP.
bool HexagonInstrInfo::isToBeScheduledASAP(const MachineInstr &MI1,
const MachineInstr &MI2) const {
@@ -2608,13 +2542,11 @@ bool HexagonInstrInfo::isToBeScheduledASAP(const MachineInstr &MI1,
return false;
}
-
bool HexagonInstrInfo::isV60VectorInstruction(const MachineInstr &MI) const {
const uint64_t V = getType(MI);
return HexagonII::TypeCVI_FIRST <= V && V <= HexagonII::TypeCVI_LAST;
}
-
// Check if the Offset is a valid auto-inc imm by Load/Store Type.
//
bool HexagonInstrInfo::isValidAutoIncImm(const EVT VT, const int Offset) const {
@@ -2653,7 +2585,6 @@ bool HexagonInstrInfo::isValidAutoIncImm(const EVT VT, const int Offset) const {
llvm_unreachable("Not an auto-inc opc!");
}
-
bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
bool Extend) const {
// This function is to check whether the "Offset" is in the correct range of
@@ -2808,12 +2739,10 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
"Please define it in the above switch statement!");
}
-
bool HexagonInstrInfo::isVecAcc(const MachineInstr &MI) const {
return isV60VectorInstruction(MI) && isAccumulator(MI);
}
-
bool HexagonInstrInfo::isVecALU(const MachineInstr &MI) const {
const uint64_t F = get(MI.getOpcode()).TSFlags;
const uint64_t V = ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
@@ -2822,7 +2751,6 @@ bool HexagonInstrInfo::isVecALU(const MachineInstr &MI) const {
V == HexagonII::TypeCVI_VA_DV;
}
-
bool HexagonInstrInfo::isVecUsableNextPacket(const MachineInstr &ProdMI,
const MachineInstr &ConsMI) const {
if (EnableACCForwarding && isVecAcc(ProdMI) && isVecAcc(ConsMI))
@@ -2915,7 +2843,6 @@ bool HexagonInstrInfo::isZeroExtendingLoad(const MachineInstr &MI) const {
}
}
-
// Add latency to instruction.
bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr &MI1,
const MachineInstr &MI2) const {
@@ -2925,7 +2852,6 @@ bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr &MI1,
return false;
}
-
/// \brief Get the base register and byte offset of a load/store instr.
bool HexagonInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt,
unsigned &BaseReg, int64_t &Offset, const TargetRegisterInfo *TRI)
@@ -2937,7 +2863,6 @@ bool HexagonInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt,
return BaseReg != 0;
}
-
/// \brief Can these instructions execute at the same time in a bundle.
bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr &First,
const MachineInstr &Second) const {
@@ -2959,13 +2884,11 @@ bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr &First,
return false;
}
-
bool HexagonInstrInfo::doesNotReturn(const MachineInstr &CallMI) const {
unsigned Opc = CallMI.getOpcode();
return Opc == Hexagon::PS_call_nr || Opc == Hexagon::PS_callr_nr;
}
-
bool HexagonInstrInfo::hasEHLabel(const MachineBasicBlock *B) const {
for (auto &I : *B)
if (I.isEHLabel())
@@ -2973,7 +2896,6 @@ bool HexagonInstrInfo::hasEHLabel(const MachineBasicBlock *B) const {
return false;
}
-
// Returns true if an instruction can be converted into a non-extended
// equivalent instruction.
bool HexagonInstrInfo::hasNonExtEquivalent(const MachineInstr &MI) const {
@@ -3011,13 +2933,11 @@ bool HexagonInstrInfo::hasNonExtEquivalent(const MachineInstr &MI) const {
return false;
}
-
bool HexagonInstrInfo::hasPseudoInstrPair(const MachineInstr &MI) const {
return Hexagon::getRealHWInstr(MI.getOpcode(),
Hexagon::InstrType_Pseudo) >= 0;
}
-
bool HexagonInstrInfo::hasUncondBranch(const MachineBasicBlock *B)
const {
MachineBasicBlock::const_iterator I = B->getFirstTerminator(), E = B->end();
@@ -3029,7 +2949,6 @@ bool HexagonInstrInfo::hasUncondBranch(const MachineBasicBlock *B)
return false;
}
-
// Returns true, if a LD insn can be promoted to a cur load.
bool HexagonInstrInfo::mayBeCurLoad(const MachineInstr &MI) const {
auto &HST = MI.getParent()->getParent()->getSubtarget<HexagonSubtarget>();
@@ -3038,14 +2957,12 @@ bool HexagonInstrInfo::mayBeCurLoad(const MachineInstr &MI) const {
HST.hasV60TOps();
}
-
// Returns true, if a ST insn can be promoted to a new-value store.
bool HexagonInstrInfo::mayBeNewStore(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
return (F >> HexagonII::mayNVStorePos) & HexagonII::mayNVStoreMask;
}
-
bool HexagonInstrInfo::producesStall(const MachineInstr &ProdMI,
const MachineInstr &ConsMI) const {
// There is no stall when ProdMI is not a V60 vector.
@@ -3064,7 +2981,6 @@ bool HexagonInstrInfo::producesStall(const MachineInstr &ProdMI,
return true;
}
-
bool HexagonInstrInfo::producesStall(const MachineInstr &MI,
MachineBasicBlock::const_instr_iterator BII) const {
// There is no stall when I is not a V60 vector.
@@ -3091,7 +3007,6 @@ bool HexagonInstrInfo::producesStall(const MachineInstr &MI,
return false;
}
-
bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr &MI,
unsigned PredReg) const {
for (unsigned opNum = 0; opNum < MI.getNumOperands(); opNum++) {
@@ -3106,7 +3021,6 @@ bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr &MI,
return MI.getOpcode() != Hexagon::A4_tlbmatch;
}
-
bool HexagonInstrInfo::PredOpcodeHasJMP_c(unsigned Opcode) const {
return (Opcode == Hexagon::J2_jumpt) ||
(Opcode == Hexagon::J2_jumpf) ||
@@ -3116,25 +3030,21 @@ bool HexagonInstrInfo::PredOpcodeHasJMP_c(unsigned Opcode) const {
(Opcode == Hexagon::J2_jumpfnewpt);
}
-
bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const {
if (Cond.empty() || !isPredicated(Cond[0].getImm()))
return false;
return !isPredicatedTrue(Cond[0].getImm());
}
-
short HexagonInstrInfo::getAbsoluteForm(const MachineInstr &MI) const {
return Hexagon::getAbsoluteForm(MI.getOpcode());
}
-
unsigned HexagonInstrInfo::getAddrMode(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
return (F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask;
}
-
// Returns the base register in a memory access (load/store). The offset is
// returned in Offset and the access size is returned in AccessSize.
unsigned HexagonInstrInfo::getBaseAndOffset(const MachineInstr &MI,
@@ -3171,7 +3081,6 @@ unsigned HexagonInstrInfo::getBaseAndOffset(const MachineInstr &MI,
return MI.getOperand(basePos).getReg();
}
-
/// Return the position of the base and offset operands for this instruction.
bool HexagonInstrInfo::getBaseAndOffsetPosition(const MachineInstr &MI,
unsigned &BasePos, unsigned &OffsetPos) const {
@@ -3203,7 +3112,6 @@ bool HexagonInstrInfo::getBaseAndOffsetPosition(const MachineInstr &MI,
return true;
}
-
// Inserts branching instructions in reverse order of their occurrence.
// e.g. jump_t t1 (i1)
// jump t2 (i2)
@@ -3265,24 +3173,20 @@ SmallVector<MachineInstr*, 2> HexagonInstrInfo::getBranchingInstrs(
return Jumpers;
}
-
short HexagonInstrInfo::getBaseWithLongOffset(short Opcode) const {
if (Opcode < 0)
return -1;
return Hexagon::getBaseWithLongOffset(Opcode);
}
-
short HexagonInstrInfo::getBaseWithLongOffset(const MachineInstr &MI) const {
return Hexagon::getBaseWithLongOffset(MI.getOpcode());
}
-
short HexagonInstrInfo::getBaseWithRegOffset(const MachineInstr &MI) const {
return Hexagon::getBaseWithRegOffset(MI.getOpcode());
}
-
// Returns Operand Index for the constant extended instruction.
unsigned HexagonInstrInfo::getCExtOpNum(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
@@ -3379,7 +3283,6 @@ HexagonII::CompoundGroup HexagonInstrInfo::getCompoundCandidateGroup(
return HexagonII::HCG_None;
}
-
// Returns -1 when there is no opcode found.
unsigned HexagonInstrInfo::getCompoundOpcode(const MachineInstr &GA,
const MachineInstr &GB) const {
@@ -3398,7 +3301,6 @@ unsigned HexagonInstrInfo::getCompoundOpcode(const MachineInstr &GA,
return -1;
}
-
int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const {
enum Hexagon::PredSense inPredSense;
inPredSense = invertPredicate ? Hexagon::PredSense_false :
@@ -3410,7 +3312,6 @@ int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const {
llvm_unreachable("Unexpected predicable instruction");
}
-
// Return the cur value instruction for a given store.
int HexagonInstrInfo::getDotCurOp(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
@@ -3428,8 +3329,6 @@ int HexagonInstrInfo::getDotCurOp(const MachineInstr &MI) const {
return 0;
}
-
-
// The diagram below shows the steps involved in the conversion of a predicated
// store instruction to its .new predicated new-value form.
//
@@ -3509,7 +3408,6 @@ int HexagonInstrInfo::getDotCurOp(const MachineInstr &MI) const {
// promoted. Therefore, in case of dependence check failure (due to R5) during
// next iteration, it should be converted back to its most basic form.
-
// Return the new value instruction for a given store.
int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const {
int NVOpcode = Hexagon::getNewValueOpcode(MI.getOpcode());
@@ -3552,7 +3450,6 @@ int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const {
return 0;
}
-
// Returns the opcode to use when converting MI, which is a conditional jump,
// into a conditional instruction which uses the .new value of the predicate.
// We also use branch probabilities to add a hint to the jump.
@@ -3579,7 +3476,6 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI,
}
}
-
// Return .new predicate version for an instruction.
int HexagonInstrInfo::getDotNewPredOp(const MachineInstr &MI,
const MachineBranchProbabilityInfo *MBPI) const {
@@ -3599,7 +3495,6 @@ int HexagonInstrInfo::getDotNewPredOp(const MachineInstr &MI,
return 0;
}
-
int HexagonInstrInfo::getDotOldOp(const int opc) const {
int NewOp = opc;
if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form
@@ -3615,7 +3510,6 @@ int HexagonInstrInfo::getDotOldOp(const int opc) const {
return NewOp;
}
-
// See if instruction could potentially be a duplex candidate.
// If so, return its group. Zero otherwise.
HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
@@ -3960,12 +3854,10 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_None;
}
-
short HexagonInstrInfo::getEquivalentHWInstr(const MachineInstr &MI) const {
return Hexagon::getRealHWInstr(MI.getOpcode(), Hexagon::InstrType_Real);
}
-
// Return first non-debug instruction in the basic block.
MachineInstr *HexagonInstrInfo::getFirstNonDbgInst(MachineBasicBlock *BB)
const {
@@ -3978,7 +3870,6 @@ MachineInstr *HexagonInstrInfo::getFirstNonDbgInst(MachineBasicBlock *BB)
return nullptr;
}
-
unsigned HexagonInstrInfo::getInstrTimingClassLatency(
const InstrItineraryData *ItinData, const MachineInstr &MI) const {
// Default to one cycle for no itinerary. However, an "empty" itinerary may
@@ -4000,7 +3891,6 @@ unsigned HexagonInstrInfo::getInstrTimingClassLatency(
return Latency;
}
-
// inverts the predication logic.
// p -> NotP
// NotP -> P
@@ -4013,7 +3903,6 @@ bool HexagonInstrInfo::getInvertedPredSense(
return true;
}
-
unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
int InvPredOpcode;
InvPredOpcode = isPredicatedTrue(Opc) ? Hexagon::getFalsePredOpcode(Opc)
@@ -4024,7 +3913,6 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
llvm_unreachable("Unexpected predicated instruction");
}
-
// Returns the max value that doesn't need to be extended.
int HexagonInstrInfo::getMaxValue(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
@@ -4039,13 +3927,11 @@ int HexagonInstrInfo::getMaxValue(const MachineInstr &MI) const {
return ~(-1U << bits);
}
-
unsigned HexagonInstrInfo::getMemAccessSize(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
return (F >> HexagonII::MemAccessSizePos) & HexagonII::MemAccesSizeMask;
}
-
// Returns the min value that doesn't need to be extended.
int HexagonInstrInfo::getMinValue(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
@@ -4060,7 +3946,6 @@ int HexagonInstrInfo::getMinValue(const MachineInstr &MI) const {
return 0;
}
-
// Returns opcode of the non-extended equivalent instruction.
short HexagonInstrInfo::getNonExtOpcode(const MachineInstr &MI) const {
// Check if the instruction has a register form that uses register in place
@@ -4086,7 +3971,6 @@ short HexagonInstrInfo::getNonExtOpcode(const MachineInstr &MI) const {
return -1;
}
-
bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond,
unsigned &PredReg, unsigned &PredRegPos, unsigned &PredRegFlags) const {
if (Cond.empty())
@@ -4107,17 +3991,14 @@ bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond,
return true;
}
-
short HexagonInstrInfo::getPseudoInstrPair(const MachineInstr &MI) const {
return Hexagon::getRealHWInstr(MI.getOpcode(), Hexagon::InstrType_Pseudo);
}
-
short HexagonInstrInfo::getRegForm(const MachineInstr &MI) const {
return Hexagon::getRegForm(MI.getOpcode());
}
-
// Return the number of bytes required to encode the instruction.
// Hexagon instructions are fixed length, 4 bytes, unless they
// use a constant extender, which requires another 4 bytes.
@@ -4156,13 +4037,11 @@ unsigned HexagonInstrInfo::getSize(const MachineInstr &MI) const {
return Size;
}
-
uint64_t HexagonInstrInfo::getType(const MachineInstr &MI) const {
const uint64_t F = MI.getDesc().TSFlags;
return (F >> HexagonII::TypePos) & HexagonII::TypeMask;
}
-
unsigned HexagonInstrInfo::getUnits(const MachineInstr &MI) const {
const TargetSubtargetInfo &ST = MI.getParent()->getParent()->getSubtarget();
const InstrItineraryData &II = *ST.getInstrItineraryData();
@@ -4171,19 +4050,16 @@ unsigned HexagonInstrInfo::getUnits(const MachineInstr &MI) const {
return IS.getUnits();
}
-
unsigned HexagonInstrInfo::getValidSubTargets(const unsigned Opcode) const {
const uint64_t F = get(Opcode).TSFlags;
return (F >> HexagonII::validSubTargetPos) & HexagonII::validSubTargetMask;
}
-
// Calculate size of the basic block without debug instructions.
unsigned HexagonInstrInfo::nonDbgBBSize(const MachineBasicBlock *BB) const {
return nonDbgMICount(BB->instr_begin(), BB->instr_end());
}
-
unsigned HexagonInstrInfo::nonDbgBundleSize(
MachineBasicBlock::const_iterator BundleHead) const {
assert(BundleHead->isBundle() && "Not a bundle header");
@@ -4192,7 +4068,6 @@ unsigned HexagonInstrInfo::nonDbgBundleSize(
return nonDbgMICount(++MII, getBundleEnd(BundleHead.getInstrIterator()));
}
-
/// immediateExtend - Changes the instruction in place to one using an immediate
/// extender.
void HexagonInstrInfo::immediateExtend(MachineInstr &MI) const {
@@ -4208,7 +4083,6 @@ void HexagonInstrInfo::immediateExtend(MachineInstr &MI) const {
MO.addTargetFlag(HexagonII::HMOTF_ConstExtended);
}
-
bool HexagonInstrInfo::invertAndChangeJumpTarget(
MachineInstr &MI, MachineBasicBlock *NewTarget) const {
DEBUG(dbgs() << "\n[invertAndChangeJumpTarget] to BB#"
@@ -4229,7 +4103,6 @@ bool HexagonInstrInfo::invertAndChangeJumpTarget(
return true;
}
-
void HexagonInstrInfo::genAllInsnTimingClasses(MachineFunction &MF) const {
/* +++ The code below is used to generate complete set of Hexagon Insn +++ */
MachineFunction::iterator A = MF.begin();
@@ -4248,7 +4121,6 @@ void HexagonInstrInfo::genAllInsnTimingClasses(MachineFunction &MF) const {
/* --- The code above is used to generate complete set of Hexagon Insn --- */
}
-
// inverts the predication logic.
// p -> NotP
// NotP -> P
@@ -4258,7 +4130,6 @@ bool HexagonInstrInfo::reversePredSense(MachineInstr &MI) const {
return true;
}
-
// Reverse the branch prediction.
unsigned HexagonInstrInfo::reversePrediction(unsigned Opcode) const {
int PredRevOpcode = -1;
@@ -4270,14 +4141,12 @@ unsigned HexagonInstrInfo::reversePrediction(unsigned Opcode) const {
return PredRevOpcode;
}
-
// TODO: Add more rigorous validation.
bool HexagonInstrInfo::validateBranchCond(const ArrayRef<MachineOperand> &Cond)
const {
return Cond.empty() || (Cond[0].isImm() && (Cond.size() != 1));
}
-
short HexagonInstrInfo::xformRegToImmOffset(const MachineInstr &MI) const {
return Hexagon::xformRegToImmOffset(MI.getOpcode());
}
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 2d184d1484e9..2358d4b7e4c0 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -16,9 +16,14 @@
#include "HexagonRegisterInfo.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include <cstdint>
+#include <vector>
#define GET_INSTRINFO_HEADER
#include "HexagonGenInstrInfo.inc"
@@ -29,9 +34,10 @@ struct EVT;
class HexagonSubtarget;
class HexagonInstrInfo : public HexagonGenInstrInfo {
- virtual void anchor();
const HexagonRegisterInfo RI;
+ virtual void anchor();
+
public:
explicit HexagonInstrInfo(HexagonSubtarget &ST);
@@ -260,7 +266,7 @@ public:
/// PredCost.
unsigned getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr &MI,
- unsigned *PredCost = 0) const override;
+ unsigned *PredCost = nullptr) const override;
/// Create machine specific model for scheduling.
DFAPacketizer *
@@ -378,7 +384,6 @@ public:
bool PredOpcodeHasJMP_c(unsigned Opcode) const;
bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const;
-
short getAbsoluteForm(const MachineInstr &MI) const;
unsigned getAddrMode(const MachineInstr &MI) const;
unsigned getBaseAndOffset(const MachineInstr &MI, int &Offset,
@@ -421,13 +426,11 @@ public:
unsigned getUnits(const MachineInstr &MI) const;
unsigned getValidSubTargets(const unsigned Opcode) const;
-
/// getInstrTimingClassLatency - Compute the instruction latency of a given
/// instruction using Timing Class information, if available.
unsigned nonDbgBBSize(const MachineBasicBlock *BB) const;
unsigned nonDbgBundleSize(MachineBasicBlock::const_iterator BundleHead) const;
-
void immediateExtend(MachineInstr &MI) const;
bool invertAndChangeJumpTarget(MachineInstr &MI,
MachineBasicBlock* NewTarget) const;
@@ -438,6 +441,6 @@ public:
short xformRegToImmOffset(const MachineInstr &MI) const;
};
-}
+} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONINSTRINFO_H
diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
index 371b52108b9b..d83bcbc41553 100644
--- a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
+++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
@@ -15,33 +15,31 @@
namespace llvm {
- namespace Hexagon {
+namespace Hexagon {
+
const unsigned int StartPacket = 0x1;
const unsigned int EndPacket = 0x2;
- }
+} // end namespace Hexagon
/// Hexagon target-specific information for each MachineFunction.
class HexagonMachineFunctionInfo : public MachineFunctionInfo {
// SRetReturnReg - Some subtargets require that sret lowering includes
// returning the value of the returned struct in a register. This field
// holds the virtual register into which the sret argument is passed.
- unsigned SRetReturnReg;
- unsigned StackAlignBaseVReg; // Aligned-stack base register (virtual)
- unsigned StackAlignBasePhysReg; // (physical)
+ unsigned SRetReturnReg = 0;
+ unsigned StackAlignBaseVReg = 0; // Aligned-stack base register (virtual)
+ unsigned StackAlignBasePhysReg = 0; // (physical)
int VarArgsFrameIndex;
- bool HasClobberLR;
- bool HasEHReturn;
+ bool HasClobberLR = false;
+ bool HasEHReturn = false;
std::map<const MachineInstr*, unsigned> PacketInfo;
virtual void anchor();
public:
- HexagonMachineFunctionInfo() : SRetReturnReg(0), StackAlignBaseVReg(0),
- StackAlignBasePhysReg(0), HasClobberLR(0), HasEHReturn(false) {}
+ HexagonMachineFunctionInfo() = default;
- HexagonMachineFunctionInfo(MachineFunction &MF) : SRetReturnReg(0),
- StackAlignBaseVReg(0), StackAlignBasePhysReg(0), HasClobberLR(0),
- HasEHReturn(false) {}
+ HexagonMachineFunctionInfo(MachineFunction &MF) {}
unsigned getSRetReturnReg() const { return SRetReturnReg; }
void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
@@ -75,6 +73,7 @@ public:
void setStackAlignBasePhysReg(unsigned R) { StackAlignBasePhysReg = R; }
unsigned getStackAlignBasePhysReg() const { return StackAlignBasePhysReg; }
};
-} // End llvm namespace
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINEFUNCTIONINFO_H
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index e902f600e881..c9c4f95dbaaa 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -10,17 +10,27 @@
// This file contains the declarations of the HexagonTargetAsmInfo properties.
//
//===----------------------------------------------------------------------===//
+
#define DEBUG_TYPE "hexagon-sdata"
-#include "HexagonTargetMachine.h"
#include "HexagonTargetObjectFile.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Type.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -44,13 +54,21 @@ static cl::opt<bool> TraceGVPlacement("trace-gv-placement",
// (e.g. -debug and -debug-only=globallayout)
#define TRACE_TO(s, X) s << X
#ifdef NDEBUG
-#define TRACE(X) do { if (TraceGVPlacement) { TRACE_TO(errs(), X); } } while (0)
+#define TRACE(X) \
+ do { \
+ if (TraceGVPlacement) { \
+ TRACE_TO(errs(), X); \
+ } \
+ } while (false)
#else
-#define TRACE(X) \
- do { \
- if (TraceGVPlacement) { TRACE_TO(errs(), X); } \
- else { DEBUG( TRACE_TO(dbgs(), X) ); } \
- } while (0)
+#define TRACE(X) \
+ do { \
+ if (TraceGVPlacement) { \
+ TRACE_TO(errs(), X); \
+ } else { \
+ DEBUG(TRACE_TO(dbgs(), X)); \
+ } \
+ } while (false)
#endif
// Returns true if the section name is such that the symbol will be put
@@ -69,7 +87,6 @@ static bool isSmallDataSection(StringRef Sec) {
Sec.find(".scommon.") != StringRef::npos;
}
-
static const char *getSectionSuffixForSize(unsigned Size) {
switch (Size) {
default:
@@ -163,7 +180,6 @@ MCSection *HexagonTargetObjectFile::getExplicitSectionGlobal(
return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, Kind, TM);
}
-
/// Return true if this global value should be placed into small data/bss
/// section.
bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
@@ -232,17 +248,14 @@ bool HexagonTargetObjectFile::isGlobalInSmallSection(const GlobalObject *GO,
return true;
}
-
bool HexagonTargetObjectFile::isSmallDataEnabled() const {
return SmallDataThreshold > 0;
}
-
unsigned HexagonTargetObjectFile::getSmallDataSize() const {
return SmallDataThreshold;
}
-
/// Descends any type down to "elementary" components,
/// discovering the smallest addressable one.
/// If zero is returned, declaration will not be modified.
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
index 5feaffe6efb9..9a09a17767a6 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
@@ -1,5 +1,4 @@
-
-//=== HexagonMCCompound.cpp - Hexagon Compound checker -------===//
+//=== HexagonMCCompound.cpp - Hexagon Compound checker -------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,18 +10,17 @@
// This file is looks at a packet and tries to form compound insns
//
//===----------------------------------------------------------------------===//
+
#include "Hexagon.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
-#include "MCTargetDesc/HexagonMCShuffler.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/MC/MCAssembler.h"
+#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
using namespace Hexagon;
@@ -79,8 +77,7 @@ static const unsigned cmpgtn1BitOpcode[8] = {
};
// enum HexagonII::CompoundGroup
-namespace {
-unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) {
+static unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) {
unsigned DstReg, SrcReg, Src1Reg, Src2Reg;
switch (MI.getOpcode()) {
@@ -173,11 +170,9 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) {
return HexagonII::HCG_None;
}
-}
/// getCompoundOp - Return the index from 0-7 into the above opcode lists.
-namespace {
-unsigned getCompoundOp(MCInst const &HMCI) {
+static unsigned getCompoundOp(MCInst const &HMCI) {
const MCOperand &Predicate = HMCI.getOperand(0);
unsigned PredReg = Predicate.getReg();
@@ -198,11 +193,10 @@ unsigned getCompoundOp(MCInst const &HMCI) {
return (PredReg == Hexagon::P0) ? tp0_jump_t : tp1_jump_t;
}
}
-}
-namespace {
-MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) {
- MCInst *CompoundInsn = 0;
+static MCInst *getCompoundInsn(MCContext &Context, MCInst const &L,
+ MCInst const &R) {
+ MCInst *CompoundInsn = nullptr;
unsigned compoundOpcode;
MCOperand Rs, Rt;
int64_t Value;
@@ -336,12 +330,10 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) {
return CompoundInsn;
}
-}
/// Non-Symmetrical. See if these two instructions are fit for compound pair.
-namespace {
-bool isOrderedCompoundPair(MCInst const &MIa, bool IsExtendedA,
- MCInst const &MIb, bool IsExtendedB) {
+static bool isOrderedCompoundPair(MCInst const &MIa, bool IsExtendedA,
+ MCInst const &MIb, bool IsExtendedB) {
unsigned MIaG = getCompoundCandidateGroup(MIa, IsExtendedA);
unsigned MIbG = getCompoundCandidateGroup(MIb, IsExtendedB);
// We have two candidates - check that this is the same register
@@ -353,10 +345,9 @@ bool isOrderedCompoundPair(MCInst const &MIa, bool IsExtendedA,
return ((MIaG == HexagonII::HCG_A && MIbG == HexagonII::HCG_B) &&
(MIa.getOperand(0).getReg() == MIb.getOperand(0).getReg()));
}
-}
-namespace {
-bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) {
+static bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context,
+ MCInst &MCI) {
assert(HexagonMCInstrInfo::isBundle(MCI));
bool JExtended = false;
for (MCInst::iterator J =
@@ -367,8 +358,7 @@ bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) {
JExtended = true;
continue;
}
- if (llvm::HexagonMCInstrInfo::getType(MCII, *JumpInst) ==
- HexagonII::TypeJ) {
+ if (HexagonMCInstrInfo::getType(MCII, *JumpInst) == HexagonII::TypeJ) {
// Try to pair with another insn (B)undled with jump.
bool BExtended = false;
for (MCInst::iterator B =
@@ -401,7 +391,6 @@ bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) {
}
return false;
}
-}
/// tryCompound - Given a bundle check for compound insns when one
/// is found update the contents fo the bundle with the compound insn.
@@ -420,6 +409,4 @@ void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII,
// a compound is found.
while (lookForCompound(MCII, Context, MCI))
;
-
- return;
}
diff --git a/lib/Target/Hexagon/RDFCopy.h b/lib/Target/Hexagon/RDFCopy.h
index 517f17cc9c64..5ece11bd5ce4 100644
--- a/lib/Target/Hexagon/RDFCopy.h
+++ b/lib/Target/Hexagon/RDFCopy.h
@@ -1,4 +1,4 @@
-//===--- RDFCopy.h --------------------------------------------------------===//
+//===--- RDFCopy.h ----------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,23 +7,26 @@
//
//===----------------------------------------------------------------------===//
-#ifndef RDF_COPY_H
-#define RDF_COPY_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H
+#define LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H
#include "RDFGraph.h"
#include <map>
#include <vector>
namespace llvm {
+
class MachineBasicBlock;
class MachineDominatorTree;
class MachineInstr;
namespace rdf {
+
struct CopyPropagation {
CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg),
Trace(false) {}
- virtual ~CopyPropagation() {}
+
+ virtual ~CopyPropagation() = default;
bool run();
void trace(bool On) { Trace = On; }
@@ -49,7 +52,9 @@ namespace rdf {
void updateMap(NodeAddr<InstrNode*> IA);
bool scanBlock(MachineBasicBlock *B);
};
-} // namespace rdf
-} // namespace llvm
-#endif
+} // end namespace rdf
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H
diff --git a/lib/Target/Hexagon/RDFGraph.cpp b/lib/Target/Hexagon/RDFGraph.cpp
index 33c3f03790f3..fa272ea1a76a 100644
--- a/lib/Target/Hexagon/RDFGraph.cpp
+++ b/lib/Target/Hexagon/RDFGraph.cpp
@@ -10,16 +10,31 @@
// Target-independent, SSA-based data flow graph for register data flow (RDF).
//
#include "RDFGraph.h"
-
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominanceFrontier.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <utility>
+#include <vector>
using namespace llvm;
using namespace rdf;
@@ -88,14 +103,12 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<NodeId> &P) {
return OS;
}
-namespace {
- void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA,
- const DataFlowGraph &G) {
- OS << Print<NodeId>(RA.Id, G) << '<'
- << Print<RegisterRef>(RA.Addr->getRegRef(G), G) << '>';
- if (RA.Addr->getFlags() & NodeAttrs::Fixed)
- OS << '!';
- }
+static void printRefHeader(raw_ostream &OS, const NodeAddr<RefNode*> RA,
+ const DataFlowGraph &G) {
+ OS << Print<NodeId>(RA.Id, G) << '<'
+ << Print<RegisterRef>(RA.Addr->getRegRef(G), G) << '>';
+ if (RA.Addr->getFlags() & NodeAttrs::Fixed)
+ OS << '!';
}
template<>
@@ -183,9 +196,11 @@ raw_ostream &operator<< (raw_ostream &OS, const Print<NodeSet> &P) {
}
namespace {
+
template <typename T>
struct PrintListV {
PrintListV(const NodeList &L, const DataFlowGraph &G) : List(L), G(G) {}
+
typedef T Type;
const NodeList &List;
const DataFlowGraph &G;
@@ -201,7 +216,8 @@ namespace {
}
return OS;
}
-}
+
+} // end anonymous namespace
template<>
raw_ostream &operator<< (raw_ostream &OS, const Print<NodeAddr<PhiNode*>> &P) {
@@ -219,10 +235,10 @@ raw_ostream &operator<< (raw_ostream &OS,
// Print the target for calls and branches (for readability).
if (MI.isCall() || MI.isBranch()) {
MachineInstr::const_mop_iterator T =
- find_if(MI.operands(),
- [] (const MachineOperand &Op) -> bool {
- return Op.isMBB() || Op.isGlobal() || Op.isSymbol();
- });
+ llvm::find_if(MI.operands(),
+ [] (const MachineOperand &Op) -> bool {
+ return Op.isMBB() || Op.isGlobal() || Op.isSymbol();
+ });
if (T != MI.operands_end()) {
OS << ' ';
if (T->isMBB())
@@ -327,8 +343,8 @@ raw_ostream &operator<< (raw_ostream &OS,
return OS;
}
-} // namespace rdf
-} // namespace llvm
+} // end namespace rdf
+} // end namespace llvm
// Node allocation functions.
//
@@ -390,7 +406,6 @@ void NodeAllocator::clear() {
ActiveEnd = nullptr;
}
-
// Insert node NA after "this" in the circular chain.
void NodeBase::append(NodeAddr<NodeBase*> NA) {
NodeId Nx = Next;
@@ -401,7 +416,6 @@ void NodeBase::append(NodeAddr<NodeBase*> NA) {
}
}
-
// Fundamental node manipulator functions.
// Obtain the register reference from a reference node.
@@ -590,7 +604,6 @@ NodeAddr<BlockNode*> FuncNode::getEntryBlock(const DataFlowGraph &G) {
return findBlock(EntryB, G);
}
-
// Target operand information.
//
@@ -641,7 +654,6 @@ bool TargetOperandInfo::isFixedReg(const MachineInstr &In, unsigned OpNum)
return false;
}
-
RegisterRef RegisterAggr::normalize(RegisterRef RR) const {
RegisterId SuperReg = RR.Reg;
while (true) {
@@ -745,7 +757,6 @@ void RegisterAggr::print(raw_ostream &OS) const {
OS << " }";
}
-
//
// The data flow graph construction.
//
@@ -753,10 +764,9 @@ void RegisterAggr::print(raw_ostream &OS) const {
DataFlowGraph::DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
const TargetRegisterInfo &tri, const MachineDominatorTree &mdt,
const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi)
- : LMI(), MF(mf), TII(tii), TRI(tri), MDT(mdt), MDF(mdf), TOI(toi) {
+ : MF(mf), TII(tii), TRI(tri), MDT(mdt), MDF(mdf), TOI(toi) {
}
-
// The implementation of the definition stack.
// Each register reference has its own definition stack. In particular,
// for a register references "Reg" and "Reg:subreg" will each have their
@@ -845,7 +855,6 @@ unsigned DataFlowGraph::DefStack::nextDown(unsigned P) const {
return P;
}
-
// Register information.
// Get the list of references aliased to RR. Lane masks are ignored.
@@ -915,7 +924,6 @@ NodeAddr<NodeBase*> DataFlowGraph::cloneNode(const NodeAddr<NodeBase*> B) {
return NA;
}
-
// Allocation routines for specific node types/kinds.
NodeAddr<UseNode*> DataFlowGraph::newUse(NodeAddr<InstrNode*> Owner,
@@ -1248,7 +1256,6 @@ bool DataFlowGraph::alias(RegisterRef RA, RegisterRef RB) const {
return false;
}
-
// Clear all information in the graph.
void DataFlowGraph::reset() {
Memory.clear();
@@ -1256,7 +1263,6 @@ void DataFlowGraph::reset() {
Func = NodeAddr<FuncNode*>();
}
-
// Return the next reference node in the instruction node IA that is related
// to RA. Conceptually, two reference nodes are related if they refer to the
// same instance of a register access, but differ in flags or other minor
diff --git a/lib/Target/Hexagon/RDFGraph.h b/lib/Target/Hexagon/RDFGraph.h
index 871062ff2b05..49d78a8b22b5 100644
--- a/lib/Target/Hexagon/RDFGraph.h
+++ b/lib/Target/Hexagon/RDFGraph.h
@@ -1,4 +1,4 @@
-//===--- RDFGraph.h -------------------------------------------------------===//
+//===--- RDFGraph.h ---------------------------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -221,20 +221,25 @@
// The statement s5 has two use nodes for t0: u7" and u9". The quotation
// mark " indicates that the node is a shadow.
//
-#ifndef RDF_GRAPH_H
-#define RDF_GRAPH_H
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
+#define LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Timer.h"
#include "llvm/Target/TargetRegisterInfo.h"
-
+#include <cassert>
+#include <cstdint>
+#include <cstring>
#include <functional>
#include <map>
#include <set>
#include <unordered_map>
+#include <utility>
#include <vector>
// RDF uses uint32_t to refer to registers. This is to ensure that the type
@@ -243,6 +248,7 @@
static_assert(sizeof(uint32_t) == sizeof(unsigned), "Those should be equal");
namespace llvm {
+
class MachineBasicBlock;
class MachineFunction;
class MachineInstr;
@@ -252,6 +258,7 @@ namespace llvm {
class TargetInstrInfo;
namespace rdf {
+
typedef uint32_t NodeId;
typedef uint32_t RegisterId;
@@ -293,9 +300,11 @@ namespace rdf {
static uint16_t set_type(uint16_t A, uint16_t T) {
return (A & ~TypeMask) | T;
}
+
static uint16_t set_kind(uint16_t A, uint16_t K) {
return (A & ~KindMask) | K;
}
+
static uint16_t set_flags(uint16_t A, uint16_t F) {
return (A & ~FlagMask) | F;
}
@@ -326,9 +335,14 @@ namespace rdf {
};
template <typename T> struct NodeAddr {
- NodeAddr() : Addr(nullptr), Id(0) {}
+ NodeAddr() : Addr(nullptr) {}
NodeAddr(T A, NodeId I) : Addr(A), Id(I) {}
+ // Type cast (casting constructor). The reason for having this class
+ // instead of std::pair.
+ template <typename S> NodeAddr(const NodeAddr<S> &NA)
+ : Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {}
+
bool operator== (const NodeAddr<T> &NA) const {
assert((Addr == NA.Addr) == (Id == NA.Id));
return Addr == NA.Addr;
@@ -336,13 +350,9 @@ namespace rdf {
bool operator!= (const NodeAddr<T> &NA) const {
return !operator==(NA);
}
- // Type cast (casting constructor). The reason for having this class
- // instead of std::pair.
- template <typename S> NodeAddr(const NodeAddr<S> &NA)
- : Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {}
T Addr;
- NodeId Id;
+ NodeId Id = 0;
};
struct NodeBase;
@@ -366,17 +376,20 @@ namespace rdf {
struct NodeAllocator {
// Amount of storage for a single node.
enum { NodeMemSize = 32 };
+
NodeAllocator(uint32_t NPB = 4096)
: NodesPerBlock(NPB), BitsPerIndex(Log2_32(NPB)),
- IndexMask((1 << BitsPerIndex)-1), ActiveEnd(nullptr) {
+ IndexMask((1 << BitsPerIndex)-1) {
assert(isPowerOf2_32(NPB));
}
+
NodeBase *ptr(NodeId N) const {
uint32_t N1 = N-1;
uint32_t BlockN = N1 >> BitsPerIndex;
uint32_t Offset = (N1 & IndexMask) * NodeMemSize;
return reinterpret_cast<NodeBase*>(Blocks[BlockN]+Offset);
}
+
NodeId id(const NodeBase *P) const;
NodeAddr<NodeBase*> New();
void clear();
@@ -384,6 +397,7 @@ namespace rdf {
private:
void startNewBlock();
bool needNewBlock();
+
uint32_t makeId(uint32_t Block, uint32_t Index) const {
// Add 1 to the id, to avoid the id of 0, which is treated as "null".
return ((Block << BitsPerIndex) | Index) + 1;
@@ -392,7 +406,7 @@ namespace rdf {
const uint32_t NodesPerBlock;
const uint32_t BitsPerIndex;
const uint32_t IndexMask;
- char *ActiveEnd;
+ char *ActiveEnd = nullptr;
std::vector<char*> Blocks;
typedef BumpPtrAllocatorImpl<MallocAllocator, 65536> AllocatorTy;
AllocatorTy MemPool;
@@ -405,6 +419,7 @@ namespace rdf {
RegisterRef() : RegisterRef(0) {}
explicit RegisterRef(RegisterId R, LaneBitmask M = LaneBitmask::getAll())
: Reg(R), Mask(R != 0 ? M : LaneBitmask::getNone()) {}
+
operator bool() const { return Reg != 0 && Mask.any(); }
bool operator== (const RegisterRef &RR) const {
return Reg == RR.Reg && Mask == RR.Mask;
@@ -420,7 +435,8 @@ namespace rdf {
struct TargetOperandInfo {
TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {}
- virtual ~TargetOperandInfo() {}
+ virtual ~TargetOperandInfo() = default;
+
virtual bool isPreserving(const MachineInstr &In, unsigned OpNum) const;
virtual bool isClobbering(const MachineInstr &In, unsigned OpNum) const;
virtual bool isFixedReg(const MachineInstr &In, unsigned OpNum) const;
@@ -428,7 +444,6 @@ namespace rdf {
const TargetInstrInfo &TII;
};
-
// Packed register reference. Only used for storage.
struct PackedRegisterRef {
RegisterId Reg;
@@ -442,11 +457,13 @@ namespace rdf {
template <typename T, unsigned N = 32>
struct IndexedSet {
IndexedSet() : Map() { Map.reserve(N); }
+
T get(uint32_t Idx) const {
// Index Idx corresponds to Map[Idx-1].
assert(Idx != 0 && !Map.empty() && Idx-1 < Map.size());
return Map[Idx-1];
}
+
uint32_t insert(T Val) {
// Linear search.
auto F = llvm::find(Map, Val);
@@ -455,11 +472,13 @@ namespace rdf {
Map.push_back(Val);
return Map.size(); // Return actual_index + 1.
}
+
uint32_t find(T Val) const {
auto F = llvm::find(Map, Val);
assert(F != Map.end());
return F - Map.begin();
}
+
private:
std::vector<T> Map;
};
@@ -478,12 +497,14 @@ namespace rdf {
assert(LM.any());
return LM.all() ? 0 : find(LM);
}
+
PackedRegisterRef pack(RegisterRef RR) {
return { RR.Reg, getIndexForLaneMask(RR.Mask) };
}
PackedRegisterRef pack(RegisterRef RR) const {
return { RR.Reg, getIndexForLaneMask(RR.Mask) };
}
+
RegisterRef unpack(PackedRegisterRef PR) const {
return RegisterRef(PR.Reg, getLaneMaskForIndex(PR.MaskId));
}
@@ -491,11 +512,8 @@ namespace rdf {
struct RegisterAggr {
RegisterAggr(const TargetRegisterInfo &tri)
- : Masks(), ExpAliasUnits(tri.getNumRegUnits()), CheckUnits(false),
- TRI(tri) {}
- RegisterAggr(const RegisterAggr &RG)
- : Masks(RG.Masks), ExpAliasUnits(RG.ExpAliasUnits),
- CheckUnits(RG.CheckUnits), TRI(RG.TRI) {}
+ : ExpAliasUnits(tri.getNumRegUnits()), CheckUnits(false), TRI(tri) {}
+ RegisterAggr(const RegisterAggr &RG) = default;
bool empty() const { return Masks.empty(); }
bool hasAliasOf(RegisterRef RR) const;
@@ -530,11 +548,11 @@ namespace rdf {
const TargetRegisterInfo &TRI;
};
-
struct NodeBase {
public:
// Make sure this is a POD.
NodeBase() = default;
+
uint16_t getType() const { return NodeAttrs::type(Attrs); }
uint16_t getKind() const { return NodeAttrs::kind(Attrs); }
uint16_t getFlags() const { return NodeAttrs::flags(Attrs); }
@@ -596,29 +614,36 @@ namespace rdf {
struct RefNode : public NodeBase {
RefNode() = default;
+
RegisterRef getRegRef(const DataFlowGraph &G) const;
+
MachineOperand &getOp() {
assert(!(getFlags() & NodeAttrs::PhiRef));
return *Ref.Op;
}
+
void setRegRef(RegisterRef RR, DataFlowGraph &G);
void setRegRef(MachineOperand *Op, DataFlowGraph &G);
+
NodeId getReachingDef() const {
return Ref.RD;
}
void setReachingDef(NodeId RD) {
Ref.RD = RD;
}
+
NodeId getSibling() const {
return Ref.Sib;
}
void setSibling(NodeId Sib) {
Ref.Sib = Sib;
}
+
bool isUse() const {
assert(getType() == NodeAttrs::Ref);
return getKind() == NodeAttrs::Use;
}
+
bool isDef() const {
assert(getType() == NodeAttrs::Ref);
return getKind() == NodeAttrs::Def;
@@ -702,6 +727,7 @@ namespace rdf {
MachineBasicBlock *getCode() const {
return CodeNode::getCode<MachineBasicBlock*>();
}
+
void addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G);
};
@@ -709,6 +735,7 @@ namespace rdf {
MachineFunction *getCode() const {
return CodeNode::getCode<MachineFunction*>();
}
+
NodeAddr<BlockNode*> findBlock(const MachineBasicBlock *BB,
const DataFlowGraph &G) const;
NodeAddr<BlockNode*> getEntryBlock(const DataFlowGraph &G);
@@ -723,6 +750,7 @@ namespace rdf {
template <typename T> T ptr(NodeId N) const {
return static_cast<T>(ptr(N));
}
+
NodeId id(const NodeBase *P) const;
template <typename T> NodeAddr<T> addr(NodeId N) const {
@@ -738,13 +766,17 @@ namespace rdf {
struct DefStack {
DefStack() = default;
+
bool empty() const { return Stack.empty() || top() == bottom(); }
+
private:
typedef NodeAddr<DefNode*> value_type;
struct Iterator {
typedef DefStack::value_type value_type;
+
Iterator &up() { Pos = DS.nextUp(Pos); return *this; }
Iterator &down() { Pos = DS.nextDown(Pos); return *this; }
+
value_type operator*() const {
assert(Pos >= 1);
return DS.Stack[Pos-1];
@@ -755,14 +787,17 @@ namespace rdf {
}
bool operator==(const Iterator &It) const { return Pos == It.Pos; }
bool operator!=(const Iterator &It) const { return Pos != It.Pos; }
+
private:
Iterator(const DefStack &S, bool Top);
+
// Pos-1 is the index in the StorageType object that corresponds to
// the top of the DefStack.
const DefStack &DS;
unsigned Pos;
friend struct DefStack;
};
+
public:
typedef Iterator iterator;
iterator top() const { return Iterator(*this, true); }
@@ -773,14 +808,18 @@ namespace rdf {
void pop();
void start_block(NodeId N);
void clear_block(NodeId N);
+
private:
friend struct Iterator;
typedef std::vector<value_type> StorageType;
+
bool isDelimiter(const StorageType::value_type &P, NodeId N = 0) const {
return (P.Addr == nullptr) && (N == 0 || P.Id == N);
}
+
unsigned nextUp(unsigned P) const;
unsigned nextDown(unsigned P) const;
+
StorageType Stack;
};
@@ -819,6 +858,7 @@ namespace rdf {
if (RemoveFromOwner)
removeFromOwner(UA);
}
+
void unlinkDef(NodeAddr<DefNode*> DA, bool RemoveFromOwner) {
unlinkDefDF(DA);
if (RemoveFromOwner)
@@ -831,23 +871,28 @@ namespace rdf {
return BA.Addr->getType() == NodeAttrs::Ref &&
BA.Addr->getKind() == Kind;
}
+
template <uint16_t Kind>
static bool IsCode(const NodeAddr<NodeBase*> BA) {
return BA.Addr->getType() == NodeAttrs::Code &&
BA.Addr->getKind() == Kind;
}
+
static bool IsDef(const NodeAddr<NodeBase*> BA) {
return BA.Addr->getType() == NodeAttrs::Ref &&
BA.Addr->getKind() == NodeAttrs::Def;
}
+
static bool IsUse(const NodeAddr<NodeBase*> BA) {
return BA.Addr->getType() == NodeAttrs::Ref &&
BA.Addr->getKind() == NodeAttrs::Use;
}
+
static bool IsPhi(const NodeAddr<NodeBase*> BA) {
return BA.Addr->getType() == NodeAttrs::Code &&
BA.Addr->getKind() == NodeAttrs::Phi;
}
+
static bool IsPreservingDef(const NodeAddr<DefNode*> DA) {
uint16_t Flags = DA.Addr->getFlags();
return (Flags & NodeAttrs::Preserving) && !(Flags & NodeAttrs::Undef);
@@ -902,6 +947,7 @@ namespace rdf {
void unlinkUseDF(NodeAddr<UseNode*> UA);
void unlinkDefDF(NodeAddr<DefNode*> DA);
+
void removeFromOwner(NodeAddr<RefNode*> RA) {
NodeAddr<InstrNode*> IA = RA.Addr->getOwner(*this);
IA.Addr->removeMember(RA, *this);
@@ -967,7 +1013,6 @@ namespace rdf {
return MM;
}
-
// Optionally print the lane mask, if it is not ~0.
struct PrintLaneMaskOpt {
PrintLaneMaskOpt(LaneBitmask M) : Mask(M) {}
@@ -991,7 +1036,9 @@ namespace rdf {
PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g)
: Print<NodeAddr<T>>(x, g) {}
};
-} // namespace rdf
-} // namespace llvm
-#endif // RDF_GRAPH_H
+} // end namespace rdf
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 6f0fdddd7d55..92d3c001df94 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -28,6 +28,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -43,6 +44,11 @@ bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
return MipsDAGToDAGISel::runOnMachineFunction(MF);
}
+void MipsSEDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ SelectionDAGISel::getAnalysisUsage(AU);
+}
+
void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
MachineFunction &MF) {
MachineInstrBuilder MIB(MF, &MI);
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index 2a8e5877e848..f89a350cab04 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -28,6 +28,8 @@ private:
bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
void addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
MachineFunction &MF);
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index aa3ffde24b99..2b9195b095e1 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -3981,40 +3981,46 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
static bool isFunctionGlobalAddress(SDValue Callee);
static bool
-resideInSameModule(SDValue Callee, Reloc::Model RelMod) {
+resideInSameSection(const Function *Caller, SDValue Callee,
+ const TargetMachine &TM) {
// If !G, Callee can be an external symbol.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
- if (!G) return false;
+ if (!G)
+ return false;
const GlobalValue *GV = G->getGlobal();
-
- if (GV->isDeclaration()) return false;
-
- switch(GV->getLinkage()) {
- default: llvm_unreachable("unknow linkage type");
- case GlobalValue::AvailableExternallyLinkage:
- case GlobalValue::ExternalWeakLinkage:
+ if (!GV->isStrongDefinitionForLinker())
return false;
- // Callee with weak linkage is allowed if it has hidden or protected
- // visibility
- case GlobalValue::LinkOnceAnyLinkage:
- case GlobalValue::LinkOnceODRLinkage: // e.g. c++ inline functions
- case GlobalValue::WeakAnyLinkage:
- case GlobalValue::WeakODRLinkage: // e.g. c++ template instantiation
- if (GV->hasDefaultVisibility())
+ // Any explicitly-specified sections and section prefixes must also match.
+ // Also, if we're using -ffunction-sections, then each function is always in
+ // a different section (the same is true for COMDAT functions).
+ if (TM.getFunctionSections() || GV->hasComdat() || Caller->hasComdat() ||
+ GV->getSection() != Caller->getSection())
+ return false;
+ if (const auto *F = dyn_cast<Function>(GV)) {
+ if (F->getSectionPrefix() != Caller->getSectionPrefix())
return false;
-
- case GlobalValue::ExternalLinkage:
- case GlobalValue::InternalLinkage:
- case GlobalValue::PrivateLinkage:
- break;
}
- // With '-fPIC', calling default visiblity function need insert 'nop' after
- // function call, no matter that function resides in same module or not, so
- // we treat it as in different module.
- if (RelMod == Reloc::PIC_ && GV->hasDefaultVisibility())
+ // If the callee might be interposed, then we can't assume the ultimate call
+ // target will be in the same section. Even in cases where we can assume that
+ // interposition won't happen, in any case where the linker might insert a
+ // stub to allow for interposition, we must generate code as though
+ // interposition might occur. To understand why this matters, consider a
+ // situation where: a -> b -> c where the arrows indicate calls. b and c are
+ // in the same section, but a is in a different module (i.e. has a different
+ // TOC base pointer). If the linker allows for interposition between b and c,
+ // then it will generate a stub for the call edge between b and c which will
+ // save the TOC pointer into the designated stack slot allocated by b. If we
+ // return true here, and therefore allow a tail call between b and c, that
+ // stack slot won't exist and the b -> c stub will end up saving b'c TOC base
+ // pointer into the stack slot allocated by a (where the a -> b stub saved
+ // a's TOC base pointer). If we're not considering a tail call, but rather,
+ // whether a nop is needed after the call instruction in b, because the linker
+ // will insert a stub, it might complain about a missing nop if we omit it
+ // (although many don't complain in this case).
+ if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
return false;
return true;
@@ -4130,11 +4136,11 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
!isa<ExternalSymbolSDNode>(Callee))
return false;
- // Check if Callee resides in the same module, because for now, PPC64 SVR4 ABI
- // (ELFv1/ELFv2) doesn't allow tail calls to a symbol resides in another
- // module.
+ // Check if Callee resides in the same section, because for now, PPC64 SVR4
+ // ABI (ELFv1/ELFv2) doesn't allow tail calls to a symbol resides in another
+ // section.
// ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
- if (!resideInSameModule(Callee, getTargetMachine().getRelocationModel()))
+ if (!resideInSameSection(MF.getFunction(), Callee, getTargetMachine()))
return false;
// TCO allows altering callee ABI, so we don't have to check further.
@@ -4592,14 +4598,6 @@ PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag, SDValue &Chain,
return CallOpc;
}
-static
-bool isLocalCall(const SDValue &Callee)
-{
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
- return G->getGlobal()->isStrongDefinitionForLinker();
- return false;
-}
-
SDValue PPCTargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
@@ -4701,6 +4699,7 @@ SDValue PPCTargetLowering::FinishCall(
// stack frame. If caller and callee belong to the same module (and have the
// same TOC), the NOP will remain unchanged.
+ MachineFunction &MF = DAG.getMachineFunction();
if (!isTailCall && Subtarget.isSVR4ABI()&& Subtarget.isPPC64() &&
!isPatchPoint) {
if (CallOpc == PPCISD::BCTRL) {
@@ -4724,11 +4723,11 @@ SDValue PPCTargetLowering::FinishCall(
// The address needs to go after the chain input but before the flag (or
// any other variadic arguments).
Ops.insert(std::next(Ops.begin()), AddTOC);
- } else if ((CallOpc == PPCISD::CALL) &&
- (!isLocalCall(Callee) ||
- DAG.getTarget().getRelocationModel() == Reloc::PIC_))
+ } else if (CallOpc == PPCISD::CALL &&
+ !resideInSameSection(MF.getFunction(), Callee, DAG.getTarget())) {
// Otherwise insert NOP for non-local calls.
CallOpc = PPCISD::CALL_NOP;
+ }
}
Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index d42e1187ce64..e1825ca1eda1 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -70,7 +70,7 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
EmitFunctionBody();
// Emit the XRay table for this function.
- EmitXRayTable();
+ emitXRayTable();
// We didn't modify anything.
return false;
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 1deefe1231ca..cd690442bb9f 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -373,6 +373,10 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
: std::next(MBBI);
+ PI = skipDebugInstructionsBackward(PI, MBB.begin());
+ if (NI != nullptr)
+ NI = skipDebugInstructionsForward(NI, MBB.end());
+
unsigned Opc = PI->getOpcode();
int Offset = 0;
@@ -2586,6 +2590,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
I = MBB.erase(I);
+ auto InsertPos = skipDebugInstructionsForward(I, MBB.end());
if (!reserveCallFrame) {
// If the stack pointer can be changed after prologue, turn the
@@ -2615,7 +2620,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
if (HasDwarfEHHandlers && !isDestroy &&
MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
- BuildCFI(MBB, I, DL,
+ BuildCFI(MBB, InsertPos, DL,
MCCFIInstruction::createGnuArgsSize(nullptr, Amount));
if (Amount == 0)
@@ -2629,7 +2634,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// If this is a callee-pop calling convention, emit a CFA adjust for
// the amount the callee popped.
if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF))
- BuildCFI(MBB, I, DL,
+ BuildCFI(MBB, InsertPos, DL,
MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));
// Add Amount to SP to destroy a frame, or subtract to setup.
@@ -2640,13 +2645,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// Merge with any previous or following adjustment instruction. Note: the
// instructions merged with here do not have CFI, so their stack
// adjustments do not feed into CfaAdjustment.
- StackAdjustment += mergeSPUpdates(MBB, I, true);
- StackAdjustment += mergeSPUpdates(MBB, I, false);
+ StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
+ StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);
if (StackAdjustment) {
if (!(Fn->optForMinSize() &&
- adjustStackWithPops(MBB, I, DL, StackAdjustment)))
- BuildStackAdjustment(MBB, I, DL, StackAdjustment,
+ adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment)))
+ BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment,
/*InEpilogue=*/false);
}
}
@@ -2662,8 +2667,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// TODO: When not using precise CFA, we also need to adjust for the
// InternalAmt here.
if (CfaAdjustment) {
- BuildCFI(MBB, I, DL, MCCFIInstruction::createAdjustCfaOffset(
- nullptr, CfaAdjustment));
+ BuildCFI(MBB, InsertPos, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr,
+ CfaAdjustment));
}
}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b293dfa98f82..fd2189397279 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -11474,6 +11474,10 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
const SmallBitVector &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ SmallVector<int, 4> WidenedMask;
+ if (!canWidenShuffleElements(Mask, WidenedMask))
+ return SDValue();
+
// TODO: If minimizing size and one of the inputs is a zero vector and the
// the zero vector has only one use, we could use a VPERM2X128 to save the
// instruction bytes needed to explicitly generate the zero vector.
@@ -11521,15 +11525,10 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// [6] - ignore
// [7] - zero high half of destination
- int MaskLO = Mask[0];
- if (MaskLO == SM_SentinelUndef)
- MaskLO = Mask[1] == SM_SentinelUndef ? 0 : Mask[1];
-
- int MaskHI = Mask[2];
- if (MaskHI == SM_SentinelUndef)
- MaskHI = Mask[3] == SM_SentinelUndef ? 0 : Mask[3];
+ int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
+ int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
- unsigned PermMask = MaskLO / 2 | (MaskHI / 2) << 4;
+ unsigned PermMask = MaskLO | (MaskHI << 4);
// If either input is a zero vector, replace it with an undef input.
// Shuffle mask values < 4 are selecting elements of V1.
@@ -11538,16 +11537,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// selecting the zero vector and setting the zero mask bit.
if (IsV1Zero) {
V1 = DAG.getUNDEF(VT);
- if (MaskLO < 4)
+ if (MaskLO < 2)
PermMask = (PermMask & 0xf0) | 0x08;
- if (MaskHI < 4)
+ if (MaskHI < 2)
PermMask = (PermMask & 0x0f) | 0x80;
}
if (IsV2Zero) {
V2 = DAG.getUNDEF(VT);
- if (MaskLO >= 4)
+ if (MaskLO >= 2)
PermMask = (PermMask & 0xf0) | 0x08;
- if (MaskHI >= 4)
+ if (MaskHI >= 2)
PermMask = (PermMask & 0x0f) | 0x80;
}
@@ -12012,11 +12011,9 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
- SmallVector<int, 4> WidenedMask;
- if (canWidenShuffleElements(Mask, WidenedMask))
- if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
- return V;
+ if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return V;
if (V2.isUndef()) {
// Check for being able to broadcast a single element.
@@ -12107,11 +12104,9 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
- SmallVector<int, 4> WidenedMask;
- if (canWidenShuffleElements(Mask, WidenedMask))
- if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
- return V;
+ if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return V;
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -12605,33 +12600,72 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
+ // Check for patterns which can be matched with a single insert of a 256-bit
+ // subvector.
+ bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
+ {0, 1, 2, 3, 0, 1, 2, 3});
+ if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
+ {0, 1, 2, 3, 8, 9, 10, 11})) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
+ SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
+ OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
+ }
+
+ assert(WidenedMask.size() == 4);
+
+ // See if this is an insertion of the lower 128-bits of V2 into V1.
+ bool IsInsert = true;
+ int V2Index = -1;
+ for (int i = 0; i < 4; ++i) {
+ assert(WidenedMask[i] >= -1);
+ if (WidenedMask[i] < 0)
+ continue;
+
+ // Make sure all V1 subvectors are in place.
+ if (WidenedMask[i] < 4) {
+ if (WidenedMask[i] != i) {
+ IsInsert = false;
+ break;
+ }
+ } else {
+ // Make sure we only have a single V2 index and its the lowest 128-bits.
+ if (V2Index >= 0 || WidenedMask[i] != 4) {
+ IsInsert = false;
+ break;
+ }
+ V2Index = i;
+ }
+ }
+ if (IsInsert && V2Index >= 0) {
+ MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
+ SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
+ DAG.getIntPtrConstant(0, DL));
+ return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
+ }
+
+ // Try to lower to to vshuf64x2/vshuf32x4.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
+ unsigned PermMask = 0;
// Insure elements came from the same Op.
- int MaxOp1Index = VT.getVectorNumElements()/2 - 1;
- for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
- if (WidenedMask[i] == SM_SentinelZero)
- return SDValue();
- if (WidenedMask[i] == SM_SentinelUndef)
+ for (int i = 0; i < 4; ++i) {
+ assert(WidenedMask[i] >= -1);
+ if (WidenedMask[i] < 0)
continue;
- SDValue Op = WidenedMask[i] > MaxOp1Index ? V2 : V1;
- unsigned OpIndex = (i < Size/2) ? 0 : 1;
+ SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
+ unsigned OpIndex = i / 2;
if (Ops[OpIndex].isUndef())
Ops[OpIndex] = Op;
else if (Ops[OpIndex] != Op)
return SDValue();
- }
- // Form a 128-bit permutation.
- // Convert the 64-bit shuffle mask selection values into 128-bit selection
- // bits defined by a vshuf64x2 instruction's immediate control byte.
- unsigned PermMask = 0, Imm = 0;
- unsigned ControlBitsNum = WidenedMask.size() / 2;
-
- for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
- // Use first element in place of undef mask.
- Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
- PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
+ // Convert the 128-bit shuffle mask selection values into 128-bit selection
+ // bits defined by a vshuf64x2 instruction's immediate control byte.
+ PermMask |= (WidenedMask[i] % 4) << (i * 2);
}
return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
@@ -13051,10 +13085,10 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
int NumElements = Mask.size();
- int NumV1Elements = 0, NumV2Elements = 0, NumSentinelElements = 0;
+ int NumV1Elements = 0, NumV2Elements = 0;
for (int M : Mask)
if (M < 0)
- ++NumSentinelElements;
+ continue;
else if (M < NumElements)
++NumV1Elements;
else
@@ -18660,8 +18694,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_IMM8_MASK:
- case INTR_TYPE_3OP_MASK:
- case INSERT_SUBVEC: {
+ case INTR_TYPE_3OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
@@ -18670,13 +18703,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
- else if (IntrData->Type == INSERT_SUBVEC) {
- // imm should be adapted to ISD::INSERT_SUBVECTOR behavior
- assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!");
- unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue();
- Imm *= Src2.getSimpleValueType().getVectorNumElements();
- Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32);
- }
// We specify 2 possible opcodes for intrinsics with rounding modes.
// First, we check if the intrinsic may have non-default rounding mode,
@@ -28693,6 +28719,29 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
Op.getOperand(2));
}
+ case ISD::INSERT_SUBVECTOR: {
+ unsigned EltSize = EltVT.getSizeInBits();
+ if (EltSize != 32 && EltSize != 64)
+ return false;
+ MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
+ // Only change element size, not type.
+ if (VT.isInteger() != OpEltVT.isInteger())
+ return false;
+ uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
+ SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
+ DCI.AddToWorklist(Op0.getNode());
+ // Op1 needs to be bitcasted to a smaller vector with the same element type.
+ SDValue Op1 = Op.getOperand(1);
+ MVT Op1VT = MVT::getVectorVT(EltVT,
+ Op1.getSimpleValueType().getSizeInBits() / EltSize);
+ Op1 = DAG.getBitcast(Op1VT, Op1);
+ DCI.AddToWorklist(Op1.getNode());
+ DCI.CombineTo(OrigOp.getNode(),
+ DAG.getNode(Opcode, DL, VT, Op0, Op1,
+ DAG.getConstant(Imm, DL, MVT::i8)));
+ return true;
+ }
}
return false;
@@ -31784,6 +31833,83 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
+/// the codegen.
+/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
+static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SDLoc &DL) {
+ assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
+ SDValue Src = N->getOperand(0);
+ unsigned Opcode = Src.getOpcode();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = Src.getValueType();
+
+ auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
+ // TODO: Add extra cases where we can truncate both inputs for the
+ // cost of one (or none).
+ // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
+ if (Op0 == Op1)
+ return true;
+
+ SDValue BC0 = peekThroughOneUseBitcasts(Op0);
+ SDValue BC1 = peekThroughOneUseBitcasts(Op1);
+ return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
+ ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
+ };
+
+ auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
+ SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
+ SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+ return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
+ };
+
+ // Don't combine if the operation has other uses.
+ if (!N->isOnlyUserOf(Src.getNode()))
+ return SDValue();
+
+ // Only support vector truncation for now.
+ // TODO: i64 scalar math would benefit as well.
+ if (!VT.isVector())
+ return SDValue();
+
+ // In most cases its only worth pre-truncating if we're only facing the cost
+ // of one truncation.
+ // i.e. if one of the inputs will constant fold or the input is repeated.
+ switch (Opcode) {
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR: {
+ SDValue Op0 = Src.getOperand(0);
+ SDValue Op1 = Src.getOperand(1);
+ if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
+ IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ return TruncateArithmetic(Op0, Op1);
+ break;
+ }
+
+ case ISD::MUL:
+ // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
+ // better to truncate if we have the chance.
+ if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
+ !TLI.isOperationLegal(Opcode, SrcVT))
+ return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
+ LLVM_FALLTHROUGH;
+ case ISD::ADD: {
+ SDValue Op0 = Src.getOperand(0);
+ SDValue Op1 = Src.getOperand(1);
+ if (TLI.isOperationLegal(Opcode, VT) &&
+ IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ return TruncateArithmetic(Op0, Op1);
+ break;
+ }
+ }
+
+ return SDValue();
+}
+
/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
@@ -31970,6 +32096,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
SDValue Src = N->getOperand(0);
SDLoc DL(N);
+ // Attempt to pre-truncate inputs to arithmetic ops instead.
+ if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
+ return V;
+
// Try to detect AVG pattern first.
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index da7437ea0ccb..908053e1342d 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -650,33 +650,6 @@ multiclass vextract_for_size<int Opcode,
From.ZSuffix # "rrkz")
To.KRCWM:$mask, From.RC:$src1,
(EXTRACT_get_vextract_imm To.RC:$ext))>;
-
- // Intrinsic call with masking.
- def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
- "x" # To.NumElts # "_" # From.Size)
- From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask),
- (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
- From.ZSuffix # "rrk")
- To.RC:$src0,
- (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
- From.RC:$src1, imm:$idx)>;
-
- // Intrinsic call with zero-masking.
- def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
- "x" # To.NumElts # "_" # From.Size)
- From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask),
- (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
- From.ZSuffix # "rrkz")
- (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM),
- From.RC:$src1, imm:$idx)>;
-
- // Intrinsic call without masking.
- def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName #
- "x" # To.NumElts # "_" # From.Size)
- From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)),
- (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
- From.ZSuffix # "rr")
- From.RC:$src1, imm:$idx)>;
}
// Codegen pattern for the alternative types
@@ -6871,18 +6844,18 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
let isCodeGenOnly = 1 in {
- defm Int_VUCOMISSZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem,
- load, "ucomiss">, PS, EVEX, VEX_LIG,
+ defm Int_VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss">, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
- defm Int_VUCOMISDZ : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem,
- load, "ucomisd">, PD, EVEX,
+ defm Int_VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd">, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
- defm Int_VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem,
- load, "comiss">, PS, EVEX, VEX_LIG,
+ defm Int_VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss">, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
- defm Int_VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem,
- load, "comisd">, PD, EVEX,
+ defm Int_VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd">, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
}
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 9d6a89363044..4cd6ae563f03 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2373,6 +2373,23 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
Sched<[WriteFAddLd, ReadAfterLd]>;
}
+// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
+multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
+ ValueType vt, Operand memop,
+ ComplexPattern mem_cpat, string OpcodeStr> {
+ def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
+ IIC_SSE_COMIS_RR>,
+ Sched<[WriteFAdd]>;
+ def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+ [(set EFLAGS, (OpNode (vt RC:$src1),
+ mem_cpat:$src2))],
+ IIC_SSE_COMIS_RM>,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
+}
+
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
"ucomiss">, PS, VEX, VEX_LIG;
@@ -2386,15 +2403,15 @@ let Defs = [EFLAGS] in {
}
let isCodeGenOnly = 1 in {
- defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
- load, "ucomiss">, PS, VEX;
- defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
- load, "ucomisd">, PD, VEX;
-
- defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
- load, "comiss">, PS, VEX;
- defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
- load, "comisd">, PD, VEX;
+ defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss">, PS, VEX;
+ defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd">, PD, VEX;
+
+ defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss">, PS, VEX;
+ defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd">, PD, VEX;
}
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
"ucomiss">, PS;
@@ -2409,15 +2426,15 @@ let Defs = [EFLAGS] in {
}
let isCodeGenOnly = 1 in {
- defm Int_UCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
- load, "ucomiss">, PS;
- defm Int_UCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
- load, "ucomisd">, PD;
-
- defm Int_COMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
- "comiss">, PS;
- defm Int_COMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
- "comisd">, PD;
+ defm Int_UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
+ sse_load_f32, "ucomiss">, PS;
+ defm Int_UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
+ sse_load_f64, "ucomisd">, PD;
+
+ defm Int_COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
+ sse_load_f32, "comiss">, PS;
+ defm Int_COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
+ sse_load_f64, "comisd">, PD;
}
} // Defs = [EFLAGS]
diff --git a/lib/Target/X86/X86InstrTablesInfo.h b/lib/Target/X86/X86InstrTablesInfo.h
index 5d2af829028a..415a891bfd97 100755
--- a/lib/Target/X86/X86InstrTablesInfo.h
+++ b/lib/Target/X86/X86InstrTablesInfo.h
@@ -1,4 +1,4 @@
-//===-- X86AVX512Info.h - X86 Instruction Tables Information ----*- C++ -*-===//
+//===-- X86InstrTablesInfo.h - X86 Instruction Tables -----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -25,8 +25,7 @@ struct X86EvexToVexCompressTableEntry {
// X86 EVEX encoded instructions that have a VEX 128 encoding
// (table format: <EVEX opcode, VEX-128 opcode>).
-static const X86EvexToVexCompressTableEntry
- X86EvexToVex128CompressTable[] = {
+static const X86EvexToVexCompressTableEntry X86EvexToVex128CompressTable[] = {
// EVEX scalar with corresponding VEX.
{ X86::Int_VCOMISDZrm , X86::Int_VCOMISDrm },
{ X86::Int_VCOMISDZrr , X86::Int_VCOMISDrr },
@@ -250,20 +249,20 @@ static const X86EvexToVexCompressTableEntry
{ X86::VUCOMISDZrr , X86::VUCOMISDrr },
{ X86::VUCOMISSZrm , X86::VUCOMISSrm },
{ X86::VUCOMISSZrr , X86::VUCOMISSrr },
-
+
{ X86::VMOV64toPQIZrr , X86::VMOV64toPQIrr },
{ X86::VMOV64toSDZrr , X86::VMOV64toSDrr },
{ X86::VMOVDI2PDIZrm , X86::VMOVDI2PDIrm },
{ X86::VMOVDI2PDIZrr , X86::VMOVDI2PDIrr },
{ X86::VMOVLHPSZrr , X86::VMOVLHPSrr },
- { X86::VMOVHLPSZrr , X86::VMOVHLPSrr },
+ { X86::VMOVHLPSZrr , X86::VMOVHLPSrr },
{ X86::VMOVPDI2DIZmr , X86::VMOVPDI2DImr },
{ X86::VMOVPDI2DIZrr , X86::VMOVPDI2DIrr },
{ X86::VMOVPQI2QIZmr , X86::VMOVPQI2QImr },
{ X86::VMOVPQIto64Zrr , X86::VMOVPQIto64rr },
{ X86::VMOVQI2PQIZrm , X86::VMOVQI2PQIrm },
{ X86::VMOVZPQILo2PQIZrr , X86::VMOVZPQILo2PQIrr },
-
+
{ X86::VPEXTRBZmr , X86::VPEXTRBmr },
{ X86::VPEXTRBZrr , X86::VPEXTRBrr },
{ X86::VPEXTRDZmr , X86::VPEXTRDmr },
@@ -272,7 +271,7 @@ static const X86EvexToVexCompressTableEntry
{ X86::VPEXTRQZrr , X86::VPEXTRQrr },
{ X86::VPEXTRWZmr , X86::VPEXTRWmr },
{ X86::VPEXTRWZrr , X86::VPEXTRWri },
-
+
{ X86::VPINSRBZrm , X86::VPINSRBrm },
{ X86::VPINSRBZrr , X86::VPINSRBrr },
{ X86::VPINSRDZrm , X86::VPINSRDrm },
@@ -294,7 +293,7 @@ static const X86EvexToVexCompressTableEntry
{ X86::VANDPDZ128rm , X86::VANDPDrm },
{ X86::VANDPDZ128rr , X86::VANDPDrr },
{ X86::VANDPSZ128rm , X86::VANDPSrm },
- { X86::VANDPSZ128rr , X86::VANDPSrr },
+ { X86::VANDPSZ128rr , X86::VANDPSrr },
{ X86::VBROADCASTSSZ128m , X86::VBROADCASTSSrm },
{ X86::VBROADCASTSSZ128r , X86::VBROADCASTSSrr },
{ X86::VBROADCASTSSZ128r_s , X86::VBROADCASTSSrr },
@@ -414,8 +413,8 @@ static const X86EvexToVexCompressTableEntry
{ X86::VMOVAPDZ128rm , X86::VMOVAPDrm },
{ X86::VMOVAPDZ128rr , X86::VMOVAPDrr },
{ X86::VMOVAPDZ128rr_REV , X86::VMOVAPDrr_REV },
- { X86::VMOVAPSZ128mr , X86::VMOVAPSmr },
- { X86::VMOVAPSZ128rm , X86::VMOVAPSrm },
+ { X86::VMOVAPSZ128mr , X86::VMOVAPSmr },
+ { X86::VMOVAPSZ128rm , X86::VMOVAPSrm },
{ X86::VMOVAPSZ128rr , X86::VMOVAPSrr },
{ X86::VMOVAPSZ128rr_REV , X86::VMOVAPSrr_REV },
{ X86::VMOVDDUPZ128rm , X86::VMOVDDUPrm },
@@ -464,8 +463,8 @@ static const X86EvexToVexCompressTableEntry
{ X86::VMOVUPDZ128rm , X86::VMOVUPDrm },
{ X86::VMOVUPDZ128rr , X86::VMOVUPDrr },
{ X86::VMOVUPDZ128rr_REV , X86::VMOVUPDrr_REV },
- { X86::VMOVUPSZ128mr , X86::VMOVUPSmr },
- { X86::VMOVUPSZ128rm , X86::VMOVUPSrm },
+ { X86::VMOVUPSZ128mr , X86::VMOVUPSmr },
+ { X86::VMOVUPSZ128rm , X86::VMOVUPSrm },
{ X86::VMOVUPSZ128rr , X86::VMOVUPSrr },
{ X86::VMOVUPSZ128rr_REV , X86::VMOVUPSrr_REV },
{ X86::VMULPDZ128rm , X86::VMULPDrm },
@@ -520,9 +519,9 @@ static const X86EvexToVexCompressTableEntry
{ X86::VPBROADCASTBZ128r , X86::VPBROADCASTBrr },
{ X86::VPBROADCASTDZ128m , X86::VPBROADCASTDrm },
{ X86::VPBROADCASTDZ128r , X86::VPBROADCASTDrr },
- { X86::VPBROADCASTQZ128m , X86::VPBROADCASTQrm },
- { X86::VPBROADCASTQZ128r , X86::VPBROADCASTQrr },
- { X86::VPBROADCASTWZ128m , X86::VPBROADCASTWrm },
+ { X86::VPBROADCASTQZ128m , X86::VPBROADCASTQrm },
+ { X86::VPBROADCASTQZ128r , X86::VPBROADCASTQrr },
+ { X86::VPBROADCASTWZ128m , X86::VPBROADCASTWrm },
{ X86::VPBROADCASTWZ128r , X86::VPBROADCASTWrr },
{ X86::VPERMILPDZ128mi , X86::VPERMILPDmi },
{ X86::VPERMILPDZ128ri , X86::VPERMILPDri },
@@ -583,7 +582,7 @@ static const X86EvexToVexCompressTableEntry
{ X86::VPMOVZXWDZ128rm , X86::VPMOVZXWDrm },
{ X86::VPMOVZXWDZ128rr , X86::VPMOVZXWDrr },
{ X86::VPMOVZXWQZ128rm , X86::VPMOVZXWQrm },
- { X86::VPMOVZXWQZ128rr , X86::VPMOVZXWQrr },
+ { X86::VPMOVZXWQZ128rr , X86::VPMOVZXWQrr },
{ X86::VPMULDQZ128rm , X86::VPMULDQrm },
{ X86::VPMULDQZ128rr , X86::VPMULDQrr },
{ X86::VPMULHRSWZ128rm , X86::VPMULHRSWrm },
@@ -612,10 +611,10 @@ static const X86EvexToVexCompressTableEntry
{ X86::VPSHUFHWZ128ri , X86::VPSHUFHWri },
{ X86::VPSHUFLWZ128mi , X86::VPSHUFLWmi },
{ X86::VPSHUFLWZ128ri , X86::VPSHUFLWri },
- { X86::VPSLLDQZ128rr , X86::VPSLLDQri },
+ { X86::VPSLLDQZ128rr , X86::VPSLLDQri },
{ X86::VPSLLDZ128ri , X86::VPSLLDri },
{ X86::VPSLLDZ128rm , X86::VPSLLDrm },
- { X86::VPSLLDZ128rr , X86::VPSLLDrr },
+ { X86::VPSLLDZ128rr , X86::VPSLLDrr },
{ X86::VPSLLQZ128ri , X86::VPSLLQri },
{ X86::VPSLLQZ128rm , X86::VPSLLQrm },
{ X86::VPSLLQZ128rr , X86::VPSLLQrr },
@@ -713,8 +712,7 @@ static const X86EvexToVexCompressTableEntry
// X86 EVEX encoded instructions that have a VEX 256 encoding
// (table format: <EVEX opcode, VEX-256 opcode>).
- static const X86EvexToVexCompressTableEntry
- X86EvexToVex256CompressTable[] = {
+ static const X86EvexToVexCompressTableEntry X86EvexToVex256CompressTable[] = {
{ X86::VADDPDZ256rm , X86::VADDPDYrm },
{ X86::VADDPDZ256rr , X86::VADDPDYrr },
{ X86::VADDPSZ256rm , X86::VADDPSYrm },
@@ -727,11 +725,11 @@ static const X86EvexToVexCompressTableEntry
{ X86::VANDPDZ256rr , X86::VANDPDYrr },
{ X86::VANDPSZ256rm , X86::VANDPSYrm },
{ X86::VANDPSZ256rr , X86::VANDPSYrr },
- { X86::VBROADCASTSDZ256m , X86::VBROADCASTSDYrm },
- { X86::VBROADCASTSDZ256r , X86::VBROADCASTSDYrr },
- { X86::VBROADCASTSDZ256r_s , X86::VBROADCASTSDYrr },
+ { X86::VBROADCASTSDZ256m , X86::VBROADCASTSDYrm },
+ { X86::VBROADCASTSDZ256r , X86::VBROADCASTSDYrr },
+ { X86::VBROADCASTSDZ256r_s , X86::VBROADCASTSDYrr },
{ X86::VBROADCASTSSZ256m , X86::VBROADCASTSSYrm },
- { X86::VBROADCASTSSZ256r , X86::VBROADCASTSSYrr },
+ { X86::VBROADCASTSSZ256r , X86::VBROADCASTSSYrr },
{ X86::VBROADCASTSSZ256r_s , X86::VBROADCASTSSYrr },
{ X86::VCVTDQ2PDZ256rm , X86::VCVTDQ2PDYrm },
{ X86::VCVTDQ2PDZ256rr , X86::VCVTDQ2PDYrr },
@@ -757,6 +755,14 @@ static const X86EvexToVexCompressTableEntry
{ X86::VDIVPDZ256rr , X86::VDIVPDYrr },
{ X86::VDIVPSZ256rm , X86::VDIVPSYrm },
{ X86::VDIVPSZ256rr , X86::VDIVPSYrr },
+ { X86::VEXTRACTF32x4Z256mr , X86::VEXTRACTF128mr },
+ { X86::VEXTRACTF64x2Z256mr , X86::VEXTRACTF128mr },
+ { X86::VEXTRACTF32x4Z256rr , X86::VEXTRACTF128rr },
+ { X86::VEXTRACTF64x2Z256rr , X86::VEXTRACTF128rr },
+ { X86::VEXTRACTI32x4Z256mr , X86::VEXTRACTI128mr },
+ { X86::VEXTRACTI64x2Z256mr , X86::VEXTRACTI128mr },
+ { X86::VEXTRACTI32x4Z256rr , X86::VEXTRACTI128rr },
+ { X86::VEXTRACTI64x2Z256rr , X86::VEXTRACTI128rr },
{ X86::VFMADD132PDZ256m , X86::VFMADD132PDYm },
{ X86::VFMADD132PDZ256r , X86::VFMADD132PDYr },
{ X86::VFMADD132PSZ256m , X86::VFMADD132PSYm },
@@ -829,6 +835,14 @@ static const X86EvexToVexCompressTableEntry
{ X86::VFNMSUB231PDZ256r , X86::VFNMSUB231PDYr },
{ X86::VFNMSUB231PSZ256m , X86::VFNMSUB231PSYm },
{ X86::VFNMSUB231PSZ256r , X86::VFNMSUB231PSYr },
+ { X86::VINSERTF32x4Z256rm , X86::VINSERTF128rm },
+ { X86::VINSERTF64x2Z256rm , X86::VINSERTF128rm },
+ { X86::VINSERTF32x4Z256rr , X86::VINSERTF128rr },
+ { X86::VINSERTF64x2Z256rr , X86::VINSERTF128rr },
+ { X86::VINSERTI32x4Z256rm , X86::VINSERTI128rm },
+ { X86::VINSERTI64x2Z256rm , X86::VINSERTI128rm },
+ { X86::VINSERTI32x4Z256rr , X86::VINSERTI128rr },
+ { X86::VINSERTI64x2Z256rr , X86::VINSERTI128rr },
{ X86::VMAXCPDZ256rm , X86::VMAXCPDYrm },
{ X86::VMAXCPDZ256rr , X86::VMAXCPDYrr },
{ X86::VMAXCPSZ256rm , X86::VMAXCPSYrm },
@@ -849,8 +863,8 @@ static const X86EvexToVexCompressTableEntry
{ X86::VMOVAPDZ256rm , X86::VMOVAPDYrm },
{ X86::VMOVAPDZ256rr , X86::VMOVAPDYrr },
{ X86::VMOVAPDZ256rr_REV , X86::VMOVAPDYrr_REV },
- { X86::VMOVAPSZ256mr , X86::VMOVAPSYmr },
- { X86::VMOVAPSZ256rm , X86::VMOVAPSYrm },
+ { X86::VMOVAPSZ256mr , X86::VMOVAPSYmr },
+ { X86::VMOVAPSZ256rm , X86::VMOVAPSYrm },
{ X86::VMOVAPSZ256rr , X86::VMOVAPSYrr },
{ X86::VMOVAPSZ256rr_REV , X86::VMOVAPSYrr_REV },
{ X86::VMOVDDUPZ256rm , X86::VMOVDDUPYrm },
@@ -943,14 +957,14 @@ static const X86EvexToVexCompressTableEntry
{ X86::VPAVGBZ256rr , X86::VPAVGBYrr },
{ X86::VPAVGWZ256rm , X86::VPAVGWYrm },
{ X86::VPAVGWZ256rr , X86::VPAVGWYrr },
- { X86::VPBROADCASTBZ256m , X86::VPBROADCASTBYrm },
- { X86::VPBROADCASTBZ256r , X86::VPBROADCASTBYrr },
- { X86::VPBROADCASTDZ256m , X86::VPBROADCASTDYrm },
- { X86::VPBROADCASTDZ256r , X86::VPBROADCASTDYrr },
- { X86::VPBROADCASTQZ256m , X86::VPBROADCASTQYrm },
- { X86::VPBROADCASTQZ256r , X86::VPBROADCASTQYrr },
- { X86::VPBROADCASTWZ256m , X86::VPBROADCASTWYrm },
- { X86::VPBROADCASTWZ256r , X86::VPBROADCASTWYrr },
+ { X86::VPBROADCASTBZ256m , X86::VPBROADCASTBYrm },
+ { X86::VPBROADCASTBZ256r , X86::VPBROADCASTBYrr },
+ { X86::VPBROADCASTDZ256m , X86::VPBROADCASTDYrm },
+ { X86::VPBROADCASTDZ256r , X86::VPBROADCASTDYrr },
+ { X86::VPBROADCASTQZ256m , X86::VPBROADCASTQYrm },
+ { X86::VPBROADCASTQZ256r , X86::VPBROADCASTQYrr },
+ { X86::VPBROADCASTWZ256m , X86::VPBROADCASTWYrm },
+ { X86::VPBROADCASTWZ256r , X86::VPBROADCASTWYrr },
{ X86::VPERMDZ256rm , X86::VPERMDYrm },
{ X86::VPERMDZ256rr , X86::VPERMDYrr },
{ X86::VPERMILPDZ256mi , X86::VPERMILPDYmi },
@@ -1050,7 +1064,7 @@ static const X86EvexToVexCompressTableEntry
{ X86::VPSLLDQZ256rr , X86::VPSLLDQYri },
{ X86::VPSLLDZ256ri , X86::VPSLLDYri },
{ X86::VPSLLDZ256rm , X86::VPSLLDYrm },
- { X86::VPSLLDZ256rr , X86::VPSLLDYrr },
+ { X86::VPSLLDZ256rr , X86::VPSLLDYrr },
{ X86::VPSLLQZ256ri , X86::VPSLLQYri },
{ X86::VPSLLQZ256rm , X86::VPSLLQYrm },
{ X86::VPSLLQZ256rr , X86::VPSLLQYrr },
@@ -1060,7 +1074,7 @@ static const X86EvexToVexCompressTableEntry
{ X86::VPSLLVQZ256rr , X86::VPSLLVQYrr },
{ X86::VPSLLWZ256ri , X86::VPSLLWYri },
{ X86::VPSLLWZ256rm , X86::VPSLLWYrm },
- { X86::VPSLLWZ256rr , X86::VPSLLWYrr },
+ { X86::VPSLLWZ256rr , X86::VPSLLWYrr },
{ X86::VPSRADZ256ri , X86::VPSRADYri },
{ X86::VPSRADZ256rm , X86::VPSRADYrm },
{ X86::VPSRADZ256rr , X86::VPSRADYrr },
@@ -1072,7 +1086,7 @@ static const X86EvexToVexCompressTableEntry
{ X86::VPSRLDQZ256rr , X86::VPSRLDQYri },
{ X86::VPSRLDZ256ri , X86::VPSRLDYri },
{ X86::VPSRLDZ256rm , X86::VPSRLDYrm },
- { X86::VPSRLDZ256rr , X86::VPSRLDYrr },
+ { X86::VPSRLDZ256rr , X86::VPSRLDYrr },
{ X86::VPSRLQZ256ri , X86::VPSRLQYri },
{ X86::VPSRLQZ256rm , X86::VPSRLQYrm },
{ X86::VPSRLQZ256rr , X86::VPSRLQYrr },
@@ -1145,4 +1159,4 @@ static const X86EvexToVexCompressTableEntry
{ X86::VXORPSZ256rr , X86::VXORPSYrr },
};
-#endif \ No newline at end of file
+#endif
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index df47b4ad583d..63a02af02faa 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -34,7 +34,7 @@ enum IntrinsicType : uint16_t {
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, BRCST32x2_TO_VEC,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
- EXPAND_FROM_MEM, INSERT_SUBVEC,
+ EXPAND_FROM_MEM,
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
};
@@ -795,30 +795,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VGETMANTS, 0),
X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM,
X86ISD::VGETMANTS, 0),
- X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
- X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC,
- ISD::INSERT_SUBVECTOR, 0),
X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK,
ISD::CTLZ, 0),
X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK,
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 2f69df064e7f..a38a4b30b77d 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -1115,56 +1115,6 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLo
OutStreamer->EmitInstruction(TC, getSubtargetInfo());
}
-void X86AsmPrinter::EmitXRayTable() {
- if (Sleds.empty())
- return;
-
- auto PrevSection = OutStreamer->getCurrentSectionOnly();
- auto Fn = MF->getFunction();
- MCSection *Section = nullptr;
- if (Subtarget->isTargetELF()) {
- if (Fn->hasComdat()) {
- Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
- Fn->getComdat()->getName());
- } else {
- Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC);
- }
- } else if (Subtarget->isTargetMachO()) {
- Section = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
- SectionKind::getReadOnlyWithRel());
- } else {
- llvm_unreachable("Unsupported target");
- }
-
- // Before we switch over, we force a reference to a label inside the
- // xray_instr_map section. Since EmitXRayTable() is always called just
- // before the function's end, we assume that this is happening after the
- // last return instruction.
- //
- // We then align the reference to 16 byte boundaries, which we determined
- // experimentally to be beneficial to avoid causing decoder stalls.
- MCSymbol *Tmp = OutContext.createTempSymbol("xray_synthetic_", true);
- OutStreamer->EmitCodeAlignment(16);
- OutStreamer->EmitSymbolValue(Tmp, 8, false);
- OutStreamer->SwitchSection(Section);
- OutStreamer->EmitLabel(Tmp);
- for (const auto &Sled : Sleds) {
- OutStreamer->EmitSymbolValue(Sled.Sled, 8);
- OutStreamer->EmitSymbolValue(CurrentFnSym, 8);
- auto Kind = static_cast<uint8_t>(Sled.Kind);
- OutStreamer->EmitBytes(
- StringRef(reinterpret_cast<const char *>(&Kind), 1));
- OutStreamer->EmitBytes(
- StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
- OutStreamer->EmitZeros(14);
- }
- OutStreamer->SwitchSection(PrevSection);
-
- Sleds.clear();
-}
-
// Returns instruction preceding MBBI in MachineFunction.
// If MBBI is the first instruction of the first basic block, returns null.
static MachineBasicBlock::const_iterator
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 2b0e672d56f2..d7792e296a58 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -598,198 +598,136 @@ int X86TTIImpl::getArithmeticInstrCost(
int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
-
- if (Kind == TTI::SK_Reverse) {
+ if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate) {
+ // 64-bit packed float vectors (v2f32) are widened to type v4f32.
+ // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
- { ISD::VECTOR_SHUFFLE, MVT::v64i8, 1 }, // vpermb
- { ISD::VECTOR_SHUFFLE, MVT::v32i8, 1 } // vpermb
+ { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb
+ { TTI::SK_Reverse, MVT::v32i8, 1 } // vpermb
};
if (ST->hasVBMI())
- if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
+ if (const auto *Entry =
+ CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWShuffleTbl[] = {
- { ISD::VECTOR_SHUFFLE, MVT::v32i16, 1 }, // vpermw
- { ISD::VECTOR_SHUFFLE, MVT::v16i16, 1 }, // vpermw
- { ISD::VECTOR_SHUFFLE, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128
- // + 2*pshufb + vinserti64x4
+ { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
+ { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
+ { TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128
+ // + 2*pshufb + vinserti64x4
};
if (ST->hasBWI())
- if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
+ if (const auto *Entry =
+ CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX512ShuffleTbl[] = {
- { ISD::VECTOR_SHUFFLE, MVT::v8f64, 1 }, // vpermpd
- { ISD::VECTOR_SHUFFLE, MVT::v16f32, 1 }, // vpermps
- { ISD::VECTOR_SHUFFLE, MVT::v8i64, 1 }, // vpermq
- { ISD::VECTOR_SHUFFLE, MVT::v16i32, 1 }, // vpermd
+ { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd
+ { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps
+ { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq
+ { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd
};
if (ST->hasAVX512())
if (const auto *Entry =
- CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX2ShuffleTbl[] = {
- { ISD::VECTOR_SHUFFLE, MVT::v4f64, 1 }, // vpermpd
- { ISD::VECTOR_SHUFFLE, MVT::v8f32, 1 }, // vpermps
- { ISD::VECTOR_SHUFFLE, MVT::v4i64, 1 }, // vpermq
- { ISD::VECTOR_SHUFFLE, MVT::v8i32, 1 }, // vpermd
- { ISD::VECTOR_SHUFFLE, MVT::v16i16, 2 }, // vperm2i128 + pshufb
- { ISD::VECTOR_SHUFFLE, MVT::v32i8, 2 } // vperm2i128 + pshufb
+ { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd
+ { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps
+ { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq
+ { TTI::SK_Reverse, MVT::v8i32, 1 }, // vpermd
+ { TTI::SK_Reverse, MVT::v16i16, 2 }, // vperm2i128 + pshufb
+ { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb
+
+ { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
+ { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb
};
if (ST->hasAVX2())
- if (const auto *Entry =
- CostTableLookup(AVX2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX1ShuffleTbl[] = {
- { ISD::VECTOR_SHUFFLE, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
- { ISD::VECTOR_SHUFFLE, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
- { ISD::VECTOR_SHUFFLE, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
- { ISD::VECTOR_SHUFFLE, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
- { ISD::VECTOR_SHUFFLE, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
- // + vinsertf128
- { ISD::VECTOR_SHUFFLE, MVT::v32i8, 4 } // vextractf128 + 2*pshufb
- // + vinsertf128
+ { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
+ { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
+ { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd
+ { TTI::SK_Reverse, MVT::v8i32, 2 }, // vperm2f128 + vpermilps
+ { TTI::SK_Reverse, MVT::v16i16, 4 }, // vextractf128 + 2*pshufb
+ // + vinsertf128
+ { TTI::SK_Reverse, MVT::v32i8, 4 }, // vextractf128 + 2*pshufb
+ // + vinsertf128
+
+ { TTI::SK_Alternate, MVT::v4i64, 1 }, // vblendpd
+ { TTI::SK_Alternate, MVT::v4f64, 1 }, // vblendpd
+ { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
+ { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
+ { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
+ { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor
};
if (ST->hasAVX())
- if (const auto *Entry =
- CostTableLookup(AVX1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry SSE41ShuffleTbl[] = {
+ { TTI::SK_Alternate, MVT::v2i64, 1 }, // pblendw
+ { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
+ { TTI::SK_Alternate, MVT::v4i32, 1 }, // pblendw
+ { TTI::SK_Alternate, MVT::v4f32, 1 }, // blendps
+ { TTI::SK_Alternate, MVT::v8i16, 1 }, // pblendw
+ { TTI::SK_Alternate, MVT::v16i8, 1 } // pblendvb
+ };
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry SSSE3ShuffleTbl[] = {
- { ISD::VECTOR_SHUFFLE, MVT::v8i16, 1 }, // pshufb
- { ISD::VECTOR_SHUFFLE, MVT::v16i8, 1 } // pshufb
+ { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
+ { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
+
+ { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por
+ { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por
};
if (ST->hasSSSE3())
- if (const auto *Entry =
- CostTableLookup(SSSE3ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry SSE2ShuffleTbl[] = {
- { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 }, // shufpd
- { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 }, // pshufd
- { ISD::VECTOR_SHUFFLE, MVT::v4i32, 1 }, // pshufd
- { ISD::VECTOR_SHUFFLE, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
- { ISD::VECTOR_SHUFFLE, MVT::v16i8, 9 } // 2*pshuflw + 2*pshufhw
- // + 2*pshufd + 2*unpck + packus
+ { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
+ { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
+ { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
+ { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
+ { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
+ // + 2*pshufd + 2*unpck + packus
+
+ { TTI::SK_Alternate, MVT::v2i64, 1 }, // movsd
+ { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd
+ { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps
+ { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
+ { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por
};
if (ST->hasSSE2())
- if (const auto *Entry =
- CostTableLookup(SSE2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry SSE1ShuffleTbl[] = {
- { ISD::VECTOR_SHUFFLE, MVT::v4f32, 1 }, // shufps
+ { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
+ { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps
};
if (ST->hasSSE1())
- if (const auto *Entry =
- CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+ if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
- } else if (Kind == TTI::SK_Alternate) {
- // 64-bit packed float vectors (v2f32) are widened to type v4f32.
- // 64-bit packed integer vectors (v2i32) are promoted to type v2i64.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
-
- // The backend knows how to generate a single VEX.256 version of
- // instruction VPBLENDW if the target supports AVX2.
- if (ST->hasAVX2() && LT.second == MVT::v16i16)
- return LT.first;
-
- static const CostTblEntry AVXAltShuffleTbl[] = {
- {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd
- {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd
-
- {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vblendps
- {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vblendps
-
- // This shuffle is custom lowered into a sequence of:
- // 2x vextractf128 , 2x vpblendw , 1x vinsertf128
- {ISD::VECTOR_SHUFFLE, MVT::v16i16, 5},
-
- // This shuffle is custom lowered into a long sequence of:
- // 2x vextractf128 , 4x vpshufb , 2x vpor , 1x vinsertf128
- {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9}
- };
-
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
- return LT.first * Entry->Cost;
-
- static const CostTblEntry SSE41AltShuffleTbl[] = {
- // These are lowered into movsd.
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
-
- // packed float vectors with four elements are lowered into BLENDI dag
- // nodes. A v4i32/v4f32 BLENDI generates a single 'blendps'/'blendpd'.
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
-
- // This shuffle generates a single pshufw.
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
-
- // There is no instruction that matches a v16i8 alternate shuffle.
- // The backend will expand it into the sequence 'pshufb + pshufb + or'.
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3}
- };
-
- if (ST->hasSSE41())
- if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE,
- LT.second))
- return LT.first * Entry->Cost;
-
- static const CostTblEntry SSSE3AltShuffleTbl[] = {
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
-
- // SSE3 doesn't have 'blendps'. The following shuffles are expanded into
- // the sequence 'shufps + pshufd'
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
-
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 3}, // pshufb + pshufb + or
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or
- };
-
- if (ST->hasSSSE3())
- if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
- return LT.first * Entry->Cost;
-
- static const CostTblEntry SSEAltShuffleTbl[] = {
- {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd
- {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd
-
- {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2}, // shufps + pshufd
- {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2}, // shufps + pshufd
-
- // This is expanded into a long sequence of four extract + four insert.
- {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8}, // 4 x pextrw + 4 pinsrw.
-
- // 8 x (pinsrw + pextrw + and + movb + movzb + or)
- {ISD::VECTOR_SHUFFLE, MVT::v16i8, 48}
- };
-
- // Fall-back (SSE3 and SSE2).
- if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl,
- ISD::VECTOR_SHUFFLE, LT.second))
- return LT.first * Entry->Cost;
-
} else if (Kind == TTI::SK_PermuteTwoSrc) {
// We assume that source and destination have the same vector type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 3bbc70ab21c6..55151c13b430 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1057,6 +1057,18 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// add(zext(xor i16 X, -32768), -32768) --> sext X
return CastInst::Create(Instruction::SExt, X, LHS->getType());
}
+
+ if (Val->isNegative() &&
+ match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) &&
+ Val->sge(-C->sext(Val->getBitWidth()))) {
+ // (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val))
+ return CastInst::Create(
+ Instruction::ZExt,
+ Builder->CreateNUWAdd(
+ X, Constant::getIntegerValue(X->getType(),
+ *C + Val->trunc(C->getBitWidth()))),
+ I.getType());
+ }
}
// FIXME: Use the match above instead of dyn_cast to allow these transforms
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 92369bd70b13..f863d192fc2f 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1581,6 +1581,62 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, V);
break;
}
+ case Intrinsic::fma:
+ case Intrinsic::fmuladd: {
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+
+ // Canonicalize constants into the RHS.
+ if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+ II->setArgOperand(0, Src1);
+ II->setArgOperand(1, Src0);
+ std::swap(Src0, Src1);
+ }
+
+ Value *LHS = nullptr;
+ Value *RHS = nullptr;
+
+ // fma fneg(x), fneg(y), z -> fma x, y, z
+ if (match(Src0, m_FNeg(m_Value(LHS))) &&
+ match(Src1, m_FNeg(m_Value(RHS)))) {
+ CallInst *NewCall = Builder->CreateCall(II->getCalledFunction(),
+ {LHS, RHS, II->getArgOperand(2)});
+ NewCall->takeName(II);
+ NewCall->copyFastMathFlags(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
+ // fma fabs(x), fabs(x), z -> fma x, x, z
+ if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) &&
+ match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) {
+ CallInst *NewCall = Builder->CreateCall(II->getCalledFunction(),
+ {LHS, LHS, II->getArgOperand(2)});
+ NewCall->takeName(II);
+ NewCall->copyFastMathFlags(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
+ // fma x, 1, z -> fadd x, z
+ if (match(Src1, m_FPOne())) {
+ Instruction *RI = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2));
+ RI->copyFastMathFlags(II);
+ return RI;
+ }
+
+ break;
+ }
+ case Intrinsic::fabs: {
+ Value *Cond;
+ Constant *LHS, *RHS;
+ if (match(II->getArgOperand(0),
+ m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) {
+ CallInst *Call0 = Builder->CreateCall(II->getCalledFunction(), {LHS});
+ CallInst *Call1 = Builder->CreateCall(II->getCalledFunction(), {RHS});
+ return SelectInst::Create(Cond, Call0, Call1);
+ }
+
+ break;
+ }
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
// Turn PPC lvx -> load if the pointer is known aligned.
@@ -2669,24 +2725,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// assume( (load addr) != null ) -> add 'nonnull' metadata to load
// (if assume is valid at the load)
- if (ICmpInst* ICmp = dyn_cast<ICmpInst>(IIOperand)) {
- Value *LHS = ICmp->getOperand(0);
- Value *RHS = ICmp->getOperand(1);
- if (ICmpInst::ICMP_NE == ICmp->getPredicate() &&
- isa<LoadInst>(LHS) &&
- isa<Constant>(RHS) &&
- RHS->getType()->isPointerTy() &&
- cast<Constant>(RHS)->isNullValue()) {
- LoadInst* LI = cast<LoadInst>(LHS);
- if (isValidAssumeForContext(II, LI, &DT)) {
- MDNode *MD = MDNode::get(II->getContext(), None);
- LI->setMetadata(LLVMContext::MD_nonnull, MD);
- return eraseInstFromFunction(*II);
- }
- }
+ CmpInst::Predicate Pred;
+ Instruction *LHS;
+ if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
+ Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
+ LHS->getType()->isPointerTy() &&
+ isValidAssumeForContext(II, LHS, &DT)) {
+ MDNode *MD = MDNode::get(II->getContext(), None);
+ LHS->setMetadata(LLVMContext::MD_nonnull, MD);
+ return eraseInstFromFunction(*II);
+
// TODO: apply nonnull return attributes to calls and invokes
// TODO: apply range metadata for range check patterns?
}
+
// If there is a dominating assume with the same condition as this one,
// then this one is redundant, and should be removed.
APInt KnownZero(1, 0), KnownOne(1, 0);
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 5276bee4e0a2..388c5e4e7fa4 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -850,20 +850,10 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
// separated by a few arithmetic operations.
BasicBlock::iterator BBI(LI);
bool IsLoadCSE = false;
- if (Value *AvailableVal =
- FindAvailableLoadedValue(&LI, LI.getParent(), BBI,
- DefMaxInstsToScan, AA, &IsLoadCSE)) {
- if (IsLoadCSE) {
- LoadInst *NLI = cast<LoadInst>(AvailableVal);
- unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_range,
- LLVMContext::MD_invariant_load, LLVMContext::MD_nonnull,
- LLVMContext::MD_invariant_group, LLVMContext::MD_align,
- LLVMContext::MD_dereferenceable,
- LLVMContext::MD_dereferenceable_or_null};
- combineMetadata(NLI, &LI, KnownIDs);
- };
+ if (Value *AvailableVal = FindAvailableLoadedValue(
+ &LI, LI.getParent(), BBI, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+ if (IsLoadCSE)
+ combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI);
return replaceInstUsesWith(
LI, Builder->CreateBitOrPointerCast(AvailableVal, LI.getType(),
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index bc38c4aca348..5ad2a1c0e3e6 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -731,6 +731,25 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
if (ConstantInt *Op1C = dyn_cast<ConstantInt>(I.getOperand(1))) {
unsigned ShAmt = Op1C->getZExtValue();
+ // Turn:
+ // %zext = zext i32 %V to i64
+ // %res = shl i64 %V, 8
+ //
+ // Into:
+ // %shl = shl i32 %V, 8
+ // %res = zext i32 %shl to i64
+ //
+ // This is only valid if %V would have zeros shifted out.
+ if (auto *ZI = dyn_cast<ZExtInst>(I.getOperand(0))) {
+ unsigned SrcBitWidth = ZI->getSrcTy()->getScalarSizeInBits();
+ if (ShAmt < SrcBitWidth &&
+ MaskedValueIsZero(ZI->getOperand(0),
+ APInt::getHighBitsSet(SrcBitWidth, ShAmt), 0, &I)) {
+ auto *Shl = Builder->CreateShl(ZI->getOperand(0), ShAmt);
+ return new ZExtInst(Shl, I.getType());
+ }
+ }
+
// If the shifted-out value is known-zero, then this is a NUW shift.
if (!I.hasNoUnsignedWrap() &&
MaskedValueIsZero(I.getOperand(0),
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 9bf638dcbae3..16e08ee58fbe 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -481,9 +481,9 @@ private:
bool processNode(DomTreeNode *Node);
Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
return LI;
- else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
return SI->getValueOperand();
assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index dee61b77412e..8b8236390bf4 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -79,6 +79,7 @@ STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
STATISTIC(NumGVNBlocksDeleted, "Number of blocks deleted");
STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified");
STATISTIC(NumGVNPhisAllSame, "Number of PHIs whos arguments are all the same");
+STATISTIC(NumGVNMaxIterations, "Maximum Number of iterations it took to converge GVN");
//===----------------------------------------------------------------------===//
// GVN Pass
@@ -714,16 +715,15 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
// Unlike loads, we never try to eliminate stores, so we do not check if they
// are simple and avoid value numbering them.
auto *SI = cast<StoreInst>(I);
- // If this store's memorydef stores the same value as the last store, the
- // memory accesses are equivalent.
- // Get the expression, if any, for the RHS of the MemoryDef.
MemoryAccess *StoreAccess = MSSA->getMemoryAccess(SI);
- MemoryAccess *StoreRHS = lookupMemoryAccessEquiv(
- cast<MemoryDef>(StoreAccess)->getDefiningAccess());
- const Expression *OldStore = createStoreExpression(SI, StoreRHS, B);
- // See if this store expression already has a value, and it's the same as our
- // current store. FIXME: Right now, we only do this for simple stores.
+ // See if we are defined by a previous store expression, it already has a
+ // value, and it's the same value as our current store. FIXME: Right now, we
+ // only do this for simple stores, we should expand to cover memcpys, etc.
if (SI->isSimple()) {
+ // Get the expression, if any, for the RHS of the MemoryDef.
+ MemoryAccess *StoreRHS = lookupMemoryAccessEquiv(
+ cast<MemoryDef>(StoreAccess)->getDefiningAccess());
+ const Expression *OldStore = createStoreExpression(SI, StoreRHS, B);
CongruenceClass *CC = ExpressionToClass.lookup(OldStore);
if (CC && CC->DefiningExpr && isa<StoreExpression>(CC->DefiningExpr) &&
CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B))
@@ -1092,23 +1092,16 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
if (auto *I = dyn_cast<Instruction>(V)) {
if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) {
// If this is a MemoryDef, we need to update the equivalence table. If
- // we
- // determined the expression is congruent to a different memory state,
- // use that different memory state. If we determined it didn't, we
- // update
- // that as well. Note that currently, we do not guarantee the
- // "different" memory state dominates us. The goal is to make things
- // that are congruent look congruent, not ensure we can eliminate one in
- // favor of the other.
- // Right now, the only way they can be equivalent is for store
- // expresions.
- if (!isa<MemoryUse>(MA)) {
- if (E && isa<StoreExpression>(E) && EClass->Members.size() != 1) {
- auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess();
- setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr);
- } else {
- setMemoryAccessEquivTo(MA, nullptr);
- }
+ // we determined the expression is congruent to a different memory
+ // state, use that different memory state. If we determined it didn't,
+ // we update that as well. Right now, we only support store
+ // expressions.
+ if (!isa<MemoryUse>(MA) && isa<StoreExpression>(E) &&
+ EClass->Members.size() != 1) {
+ auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess();
+ setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr);
+ } else {
+ setMemoryAccessEquivTo(MA, nullptr);
}
markMemoryUsersTouched(MA);
}
@@ -1391,7 +1384,7 @@ void NewGVN::valueNumberInstruction(Instruction *I) {
} else {
// Handle terminators that return values. All of them produce values we
// don't currently understand.
- if (!I->getType()->isVoidTy()){
+ if (!I->getType()->isVoidTy()) {
auto *Symbolized = createUnknownExpression(I);
performCongruenceFinding(I, Symbolized);
}
@@ -1427,14 +1420,12 @@ void NewGVN::verifyMemoryCongruency() {
continue;
if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second);
- if (FirstMUD && SecondMUD) {
- auto *FirstInst = FirstMUD->getMemoryInst();
- auto *SecondInst = SecondMUD->getMemoryInst();
+ if (FirstMUD && SecondMUD)
assert(
- ValueToClass.lookup(FirstInst) == ValueToClass.lookup(SecondInst) &&
+ ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
+ ValueToClass.lookup(SecondMUD->getMemoryInst()) &&
"The instructions for these memory operations should have been in "
"the same congruence class");
- }
} else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
// We can only sanely verify that MemoryDefs in the operand list all have
@@ -1538,9 +1529,11 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
initializeCongruenceClasses(F);
+ unsigned int Iterations = 0;
// We start out in the entry block.
BasicBlock *LastBlock = &F.getEntryBlock();
while (TouchedInstructions.any()) {
+ ++Iterations;
// Walk through all the instructions in all the blocks in RPO.
for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1;
InstrNum = TouchedInstructions.find_next(InstrNum)) {
@@ -1587,8 +1580,7 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
TouchedInstructions.reset(InstrNum);
}
}
-
-// FIXME: Move this to expensive checks when we are satisfied with NewGVN
+ NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
#ifndef NDEBUG
verifyMemoryCongruency();
#endif
@@ -2070,7 +2062,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
// Cleanup the congruence class.
SmallPtrSet<Value *, 4> MembersLeft;
- for (Value * Member : CC->Members) {
+ for (Value *Member : CC->Members) {
if (Member->getType()->isVoidTy()) {
MembersLeft.insert(Member);
continue;
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index a2ceded106b4..a40079ca8e76 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -760,7 +760,7 @@ static void PropagateParallelLoopAccessMetadata(CallSite CS,
/// When inlining a function that contains noalias scope metadata,
/// this metadata needs to be cloned so that the inlined blocks
-/// have different "unqiue scopes" at every call site. Were this not done, then
+/// have different "unique scopes" at every call site. Were this not done, then
/// aliasing scopes from a function inlined into a caller multiple times could
/// not be differentiated (and this would lead to miscompiles because the
/// non-aliasing property communicated by the metadata could have
diff --git a/lib/Transforms/Utils/LoopUnrollPeel.cpp b/lib/Transforms/Utils/LoopUnrollPeel.cpp
index dc526a20c903..842cf31f2e3d 100644
--- a/lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ b/lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -335,10 +335,12 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
uint64_t TrueWeight, FalseWeight;
- uint64_t ExitWeight = 0, BackEdgeWeight = 0;
+ uint64_t ExitWeight = 0, CurHeaderWeight = 0;
if (LatchBR->extractProfMetadata(TrueWeight, FalseWeight)) {
ExitWeight = HeaderIdx ? TrueWeight : FalseWeight;
- BackEdgeWeight = HeaderIdx ? FalseWeight : TrueWeight;
+ // The # of times the loop body executes is the sum of the exit block
+ // weight and the # of times the backedges are taken.
+ CurHeaderWeight = TrueWeight + FalseWeight;
}
// For each peeled-off iteration, make a copy of the loop.
@@ -346,15 +348,14 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
SmallVector<BasicBlock *, 8> NewBlocks;
ValueToValueMapTy VMap;
- // The exit weight of the previous iteration is the header entry weight
- // of the current iteration. So this is exactly how many dynamic iterations
- // the current peeled-off static iteration uses up.
+ // Subtract the exit weight from the current header weight -- the exit
+ // weight is exactly the weight of the previous iteration's header.
// FIXME: due to the way the distribution is constructed, we need a
// guard here to make sure we don't end up with non-positive weights.
- if (ExitWeight < BackEdgeWeight)
- BackEdgeWeight -= ExitWeight;
+ if (ExitWeight < CurHeaderWeight)
+ CurHeaderWeight -= ExitWeight;
else
- BackEdgeWeight = 1;
+ CurHeaderWeight = 1;
cloneLoopBlocks(L, Iter, InsertTop, InsertBot, Exit,
NewBlocks, LoopBlocks, VMap, LVMap, LI);
@@ -388,6 +389,14 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
// Adjust the branch weights on the loop exit.
if (ExitWeight) {
+ // The backedge count is the difference of current header weight and
+ // current loop exit weight. If the current header weight is smaller than
+ // the current loop exit weight, we mark the loop backedge weight as 1.
+ uint64_t BackEdgeWeight = 0;
+ if (ExitWeight < CurHeaderWeight)
+ BackEdgeWeight = CurHeaderWeight - ExitWeight;
+ else
+ BackEdgeWeight = 1;
MDBuilder MDB(LatchBR->getContext());
MDNode *WeightNode =
HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight)
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 3846b21c502e..54390e77bb1f 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1574,12 +1574,20 @@ static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
I0->getOperandUse(O).set(NewOperands[O]);
I0->moveBefore(&*BBEnd->getFirstInsertionPt());
- // Update metadata and IR flags.
+ // The debug location for the "common" instruction is the merged locations of
+ // all the commoned instructions. We start with the original location of the
+ // "common" instruction and iteratively merge each location in the loop below.
+ DILocation *Loc = I0->getDebugLoc();
+
+ // Update metadata and IR flags, and merge debug locations.
for (auto *I : Insts)
if (I != I0) {
+ Loc = DILocation::getMergedLocation(Loc, I->getDebugLoc());
combineMetadataForCSE(I0, I);
I0->andIRFlags(I);
}
+ if (!isa<CallInst>(I0))
+ I0->setDebugLoc(Loc);
if (!isa<StoreInst>(I0)) {
// canSinkLastInstruction checked that all instructions were used by
diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt
index b1a47b55cfcd..bf802a3b4ea8 100644
--- a/runtimes/CMakeLists.txt
+++ b/runtimes/CMakeLists.txt
@@ -73,7 +73,13 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR})
# Setting a variable to let sub-projects detect which other projects
# will be included under here.
set(HAVE_${canon_name} On)
+ endforeach()
+ # We do this in two loops so that HAVE_* is set for each runtime before the
+ # other runtimes are added.
+ foreach(entry ${runtimes})
+ get_filename_component(projName ${entry} NAME)
+
# Between each sub-project we want to cache and clear the LIT properties
set_property(GLOBAL PROPERTY LLVM_LIT_TESTSUITES)
set_property(GLOBAL PROPERTY LLVM_LIT_PARAMS)
diff --git a/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll b/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
index 2e162f0f0005..9e706d62f8fc 100644
--- a/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
+++ b/test/Analysis/CostModel/X86/alternate-shuffle-cost.ll
@@ -207,7 +207,7 @@ define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) {
ret <8 x i16> %1
}
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i16':
-; SSE2: Cost Model: {{.*}} 8 for instruction: %1 = shufflevector
+; SSE2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
@@ -219,7 +219,7 @@ define <8 x i16> @test_v8i16_2(<8 x i16> %a, <8 x i16> %b) {
ret <8 x i16> %1
}
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v8i16_2':
-; SSE2: Cost Model: {{.*}} 8 for instruction: %1 = shufflevector
+; SSE2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
@@ -280,11 +280,11 @@ define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) {
ret <16 x i8> %1
}
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i8':
-; SSE2: Cost Model: {{.*}} 48 for instruction: %1 = shufflevector
+; SSE2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
-; SSE41: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
-; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
-; AVX2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
define <16 x i8> @test_v16i8_2(<16 x i8> %a, <16 x i8> %b) {
@@ -292,11 +292,11 @@ define <16 x i8> @test_v16i8_2(<16 x i8> %a, <16 x i8> %b) {
ret <16 x i8> %1
}
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i8_2':
-; SSE2: Cost Model: {{.*}} 48 for instruction: %1 = shufflevector
+; SSE2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
; SSSE3: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
-; SSE41: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
-; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
-; AVX2: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
@@ -304,10 +304,10 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
ret <16 x i16> %1
}
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i16':
-; SSE2: Cost Model: {{.*}} 16 for instruction: %1 = shufflevector
+; SSE2: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
-; AVX: Cost Model: {{.*}} 5 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
@@ -316,10 +316,10 @@ define <16 x i16> @test_v16i16_2(<16 x i16> %a, <16 x i16> %b) {
ret <16 x i16> %1
}
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v16i16_2':
-; SSE2: Cost Model: {{.*}} 16 for instruction: %1 = shufflevector
+; SSE2: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
-; AVX: Cost Model: {{.*}} 5 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
define <32 x i8> @test_v32i8(<32 x i8> %a, <32 x i8> %b) {
@@ -327,11 +327,11 @@ define <32 x i8> @test_v32i8(<32 x i8> %a, <32 x i8> %b) {
ret <32 x i8> %1
}
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v32i8':
-; SSE2: Cost Model: {{.*}} 96 for instruction: %1 = shufflevector
+; SSE2: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
-; SSE41: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
-; AVX: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector
-; AVX2: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
define <32 x i8> @test_v32i8_2(<32 x i8> %a, <32 x i8> %b) {
@@ -339,9 +339,9 @@ define <32 x i8> @test_v32i8_2(<32 x i8> %a, <32 x i8> %b) {
ret <32 x i8> %1
}
; CHECK: Printing analysis 'Cost Model Analysis' for function 'test_v32i8_2':
-; SSE2: Cost Model: {{.*}} 96 for instruction: %1 = shufflevector
+; SSE2: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
; SSSE3: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
-; SSE41: Cost Model: {{.*}} 6 for instruction: %1 = shufflevector
-; AVX: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector
-; AVX2: Cost Model: {{.*}} 9 for instruction: %1 = shufflevector
+; SSE41: Cost Model: {{.*}} 2 for instruction: %1 = shufflevector
+; AVX: Cost Model: {{.*}} 3 for instruction: %1 = shufflevector
+; AVX2: Cost Model: {{.*}} 1 for instruction: %1 = shufflevector
diff --git a/test/Analysis/RegionInfo/bad_node_traversal.ll b/test/Analysis/RegionInfo/bad_node_traversal.ll
new file mode 100644
index 000000000000..00dd1207af9f
--- /dev/null
+++ b/test/Analysis/RegionInfo/bad_node_traversal.ll
@@ -0,0 +1,43 @@
+; REQUIRES: asserts
+; RUN: opt -regions -analyze < %s | FileCheck %s
+
+; While working on improvements to the region info analysis, this test
+; case caused an incorrect region 3 => 8 to be detected.
+
+define internal i8 @wibble() {
+bb:
+ br i1 true, label %bb1, label %bb8
+
+bb1: ; preds = %bb
+ switch i32 0, label %bb2 [
+ i32 0, label %bb3
+ i32 1, label %bb7
+ ]
+
+bb2: ; preds = %bb1
+ br label %bb4
+
+bb3: ; preds = %bb1
+ br label %bb5
+
+bb4: ; preds = %bb2
+ br label %bb6
+
+bb5: ; preds = %bb3
+ br label %bb6
+
+bb6: ; preds = %bb5, %bb4
+ br label %bb7
+
+bb7: ; preds = %bb6, %bb1
+ br label %bb8
+
+bb8: ; preds = %bb7, %bb
+ ret i8 1
+}
+
+; CHECK: [0] bb => <Function Return>
+; CHECK-NEXT: [1] bb => bb8
+; CHECK-NEXT: [2] bb1 => bb7
+; CHECK-NEXT: End region tree
+
diff --git a/test/Bitcode/DIGlobalVariableExpression.ll b/test/Bitcode/DIGlobalVariableExpression.ll
index 0424a0e42a36..0bb0488b131f 100644
--- a/test/Bitcode/DIGlobalVariableExpression.ll
+++ b/test/Bitcode/DIGlobalVariableExpression.ll
@@ -1,5 +1,8 @@
; RUN: llvm-dis -o - %s.bc | FileCheck %s
+; RUN: llvm-dis -o - %s.bc | llvm-as - | llvm-bcanalyzer -dump - | FileCheck %s --check-prefix=BC
+; BC: GLOBAL_VAR_EXPR
+; BC: GLOBAL_DECL_ATTACHMENT
; CHECK: @g = common global i32 0, align 4, !dbg ![[G:[0-9]+]]
; CHECK: @h = common global i32 0, align 4, !dbg ![[H:[0-9]+]]
; CHECK: ![[G]] = {{.*}}!DIGlobalVariableExpression(var: ![[GVAR:[0-9]+]], expr: ![[GEXPR:[0-9]+]])
diff --git a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
index ae77f7e099db..412651c55678 100644
--- a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
+++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
@@ -1,5 +1,6 @@
; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s -check-prefix=CYCLONE --check-prefix=ALL
; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s -check-prefix=KRYO --check-prefix=ALL
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=falkor < %s | FileCheck %s -check-prefix=FALKOR --check-prefix=ALL
; rdar://11481771
; rdar://13713797
@@ -16,6 +17,10 @@ entry:
; KRYO: movi v1.2d, #0000000000000000
; KRYO: movi v2.2d, #0000000000000000
; KRYO: movi v3.2d, #0000000000000000
+; FALKOR: movi v0.2d, #0000000000000000
+; FALKOR: movi v1.2d, #0000000000000000
+; FALKOR: movi v2.2d, #0000000000000000
+; FALKOR: movi v3.2d, #0000000000000000
tail call void @bar(double 0.000000e+00, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00) nounwind
ret void
}
@@ -47,6 +52,8 @@ define void @t4() nounwind ssp {
; CYCLONE: movi.2d v1, #0000000000000000
; KRYO: movi v0.2d, #0000000000000000
; KRYO: movi v1.2d, #0000000000000000
+; FALKOR: movi v0.2d, #0000000000000000
+; FALKOR: movi v1.2d, #0000000000000000
tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind
ret void
}
diff --git a/test/CodeGen/AArch64/store_merge_pair_offset.ll b/test/CodeGen/AArch64/store_merge_pair_offset.ll
new file mode 100644
index 000000000000..a091f0fd911c
--- /dev/null
+++ b/test/CodeGen/AArch64/store_merge_pair_offset.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -enable-misched=false -enable-post-misched=false -o - %s | FileCheck %s
+
+define i64 @test(i64* %a) nounwind {
+ ; CHECK: ldp x{{[0-9]+}}, x{{[0-9]+}}
+ ; CHECK-NOT: ldr
+ %p1 = getelementptr inbounds i64, i64* %a, i32 64
+ %tmp1 = load i64, i64* %p1, align 2
+ %p2 = getelementptr inbounds i64, i64* %a, i32 63
+ %tmp2 = load i64, i64* %p2, align 2
+ %tmp3 = add i64 %tmp1, %tmp2
+ ret i64 %tmp3
+}
diff --git a/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll b/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll
new file mode 100644
index 000000000000..8d8885852afe
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}main:
+; GCN: s_mov_b32 m0, s0
+; VI-NEXT: s_nop 0
+; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GCN-NEXT: s_endpgm
+
+define amdgpu_gs void @main(i32 inreg %a) #0 {
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %a)
+ ret void
+}
+
+; GCN-LABEL: {{^}}main_halt:
+; GCN: s_mov_b32 m0, s0
+; VI-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsghalt sendmsg(MSG_INTERRUPT)
+; GCN-NEXT: s_endpgm
+
+define void @main_halt(i32 inreg %a) #0 {
+ call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %a)
+ ret void
+}
+
+; GCN-LABEL: {{^}}legacy:
+; GCN: s_mov_b32 m0, s0
+; VI-NEXT: s_nop 0
+; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GCN-NEXT: s_endpgm
+
+define amdgpu_gs void @legacy(i32 inreg %a) #0 {
+ call void @llvm.SI.sendmsg(i32 3, i32 %a)
+ ret void
+}
+
+declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
+declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0
+declare void @llvm.SI.sendmsg(i32, i32) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll b/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll
new file mode 100644
index 000000000000..31f9cfca6def
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll
@@ -0,0 +1,161 @@
+;RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}test_interrupt:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT)
+define void @test_interrupt() {
+body:
+ call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0);
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_gs_emit:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
+define void @test_gs_emit() {
+body:
+ call void @llvm.amdgcn.s.sendmsg(i32 34, i32 0);
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_gs_cut:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
+define void @test_gs_cut() {
+body:
+ call void @llvm.amdgcn.s.sendmsg(i32 274, i32 0);
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_gs_emit_cut:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
+define void @test_gs_emit_cut() {
+body:
+ call void @llvm.amdgcn.s.sendmsg(i32 562, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_gs_done:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+define void @test_gs_done() {
+body:
+ call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
+ ret void
+}
+
+
+; CHECK-LABEL: {{^}}test_interrupt_halt:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsghalt sendmsg(MSG_INTERRUPT)
+define void @test_interrupt_halt() {
+body:
+ call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_gs_emit_halt:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT, 0)
+define void @test_gs_emit_halt() {
+body:
+ call void @llvm.amdgcn.s.sendmsghalt(i32 34, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_gs_cut_halt:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_CUT, 1)
+define void @test_gs_cut_halt() {
+body:
+ call void @llvm.amdgcn.s.sendmsghalt(i32 274, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_gs_emit_cut_halt:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
+define void @test_gs_emit_cut_halt() {
+body:
+ call void @llvm.amdgcn.s.sendmsghalt(i32 562, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_gs_done_halt:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsghalt sendmsg(MSG_GS_DONE, GS_OP_NOP)
+define void @test_gs_done_halt() {
+body:
+ call void @llvm.amdgcn.s.sendmsghalt(i32 3, i32 0)
+ ret void
+}
+
+; Legacy
+; CHECK-LABEL: {{^}}test_legacy_interrupt:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT)
+define void @test_legacy_interrupt() {
+body:
+ call void @llvm.SI.sendmsg(i32 1, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_legacy_gs_emit:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
+define void @test_legacy_gs_emit() {
+body:
+ call void @llvm.SI.sendmsg(i32 34, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_legacy_gs_cut:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
+define void @test_legacy_gs_cut() {
+body:
+ call void @llvm.SI.sendmsg(i32 274, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_legacy_gs_emit_cut:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
+define void @test_legacy_gs_emit_cut() {
+body:
+ call void @llvm.SI.sendmsg(i32 562, i32 0)
+ ret void
+}
+
+; CHECK-LABEL: {{^}}test_legacy_gs_done:
+; CHECK: s_mov_b32 m0, 0
+; CHECK-NOT: s_mov_b32 m0
+; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+define void @test_legacy_gs_done() {
+body:
+ call void @llvm.SI.sendmsg(i32 3, i32 0)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
+declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0
+declare void @llvm.SI.sendmsg(i32, i32) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll
deleted file mode 100644
index 2d4987643a2b..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}main:
-; GCN: s_mov_b32 m0, s0
-; VI-NEXT: s_nop 0
-; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP)
-; GCN-NEXT: s_endpgm
-
-define amdgpu_gs void @main(i32 inreg %a) #0 {
- call void @llvm.SI.sendmsg(i32 3, i32 %a)
- ret void
-}
-
-declare void @llvm.SI.sendmsg(i32, i32) #0
-
-attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll b/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll
deleted file mode 100644
index c4bb27676e7d..000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.sendmsg.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-; CHECK-LABEL: {{^}}main:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
-; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
-
-define void @main() {
-main_body:
- call void @llvm.SI.sendmsg(i32 34, i32 0);
- call void @llvm.SI.sendmsg(i32 274, i32 0);
- call void @llvm.SI.sendmsg(i32 562, i32 0);
- call void @llvm.SI.sendmsg(i32 3, i32 0);
- ret void
-}
-
-; Function Attrs: nounwind
-declare void @llvm.SI.sendmsg(i32, i32) #0
-
-attributes #0 = { nounwind }
diff --git a/test/CodeGen/PowerPC/ppc64-blnop.ll b/test/CodeGen/PowerPC/ppc64-blnop.ll
new file mode 100644
index 000000000000..2fe23f91c83d
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-blnop.ll
@@ -0,0 +1,129 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -function-sections -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-FS
+; RUN: llc < %s -relocation-model=pic -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -function-sections -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-FS
+
+%class.T = type { [2 x i8] }
+
+define void @e_callee(%class.T* %this, i8* %c) { ret void }
+define void @e_caller(%class.T* %this, i8* %c) {
+ call void @e_callee(%class.T* %this, i8* %c)
+ ret void
+
+; CHECK-LABEL: e_caller:
+; CHECK: bl e_callee
+; CHECK-NEXT: nop
+
+; CHECK-FS-LABEL: e_caller:
+; CHECK-FS: bl e_callee
+; CHECK-FS-NEXT: nop
+}
+
+define void @e_scallee(%class.T* %this, i8* %c) section "different" { ret void }
+define void @e_scaller(%class.T* %this, i8* %c) {
+ call void @e_scallee(%class.T* %this, i8* %c)
+ ret void
+
+; CHECK-LABEL: e_scaller:
+; CHECK: bl e_scallee
+; CHECK-NEXT: nop
+}
+
+define void @e_s2callee(%class.T* %this, i8* %c) { ret void }
+define void @e_s2caller(%class.T* %this, i8* %c) section "different" {
+ call void @e_s2callee(%class.T* %this, i8* %c)
+ ret void
+
+; CHECK-LABEL: e_s2caller:
+; CHECK: bl e_s2callee
+; CHECK-NEXT: nop
+}
+
+$cd1 = comdat any
+$cd2 = comdat any
+
+define void @e_ccallee(%class.T* %this, i8* %c) comdat($cd1) { ret void }
+define void @e_ccaller(%class.T* %this, i8* %c) comdat($cd2) {
+ call void @e_ccallee(%class.T* %this, i8* %c)
+ ret void
+
+; CHECK-LABEL: e_ccaller:
+; CHECK: bl e_ccallee
+; CHECK-NEXT: nop
+}
+
+$cd = comdat any
+
+define void @e_c1callee(%class.T* %this, i8* %c) comdat($cd) { ret void }
+define void @e_c1caller(%class.T* %this, i8* %c) comdat($cd) {
+ call void @e_c1callee(%class.T* %this, i8* %c)
+ ret void
+
+; CHECK-LABEL: e_c1caller:
+; CHECK: bl e_c1callee
+; CHECK-NEXT: nop
+}
+
+define weak_odr hidden void @wo_hcallee(%class.T* %this, i8* %c) { ret void }
+define void @wo_hcaller(%class.T* %this, i8* %c) {
+ call void @wo_hcallee(%class.T* %this, i8* %c)
+ ret void
+
+; CHECK-LABEL: wo_hcaller:
+; CHECK: bl wo_hcallee
+; CHECK-NEXT: nop
+}
+
+define weak_odr protected void @wo_pcallee(%class.T* %this, i8* %c) { ret void }
+define void @wo_pcaller(%class.T* %this, i8* %c) {
+ call void @wo_pcallee(%class.T* %this, i8* %c)
+ ret void
+
+; CHECK-LABEL: wo_pcaller:
+; CHECK: bl wo_pcallee
+; CHECK-NEXT: nop
+}
+
+define weak_odr void @wo_callee(%class.T* %this, i8* %c) { ret void }
+define void @wo_caller(%class.T* %this, i8* %c) {
+ call void @wo_callee(%class.T* %this, i8* %c)
+ ret void
+
+; CHECK-LABEL: wo_caller:
+; CHECK: bl wo_callee
+; CHECK-NEXT: nop
+}
+
+define weak protected void @w_pcallee(i8* %ptr) { ret void }
+define void @w_pcaller(i8* %ptr) {
+ call void @w_pcallee(i8* %ptr)
+ ret void
+
+; CHECK-LABEL: w_pcaller:
+; CHECK: bl w_pcallee
+; CHECK-NEXT: nop
+}
+
+define weak hidden void @w_hcallee(i8* %ptr) { ret void }
+define void @w_hcaller(i8* %ptr) {
+ call void @w_hcallee(i8* %ptr)
+ ret void
+
+; CHECK-LABEL: w_hcaller:
+; CHECK: bl w_hcallee
+; CHECK-NEXT: nop
+}
+
+define weak void @w_callee(i8* %ptr) { ret void }
+define void @w_caller(i8* %ptr) {
+ call void @w_callee(i8* %ptr)
+ ret void
+
+; CHECK-LABEL: w_caller:
+; CHECK: bl w_callee
+; CHECK-NEXT: nop
+}
+
diff --git a/test/CodeGen/PowerPC/ppc64-sibcall.ll b/test/CodeGen/PowerPC/ppc64-sibcall.ll
index 418b7828f1d9..59e545601475 100644
--- a/test/CodeGen/PowerPC/ppc64-sibcall.ll
+++ b/test/CodeGen/PowerPC/ppc64-sibcall.ll
@@ -142,7 +142,7 @@ define void @wo_hcaller(%class.T* %this, i8* %c) {
ret void
; CHECK-SCO-LABEL: wo_hcaller:
-; CHECK-SCO: b wo_hcallee
+; CHECK-SCO: bl wo_hcallee
}
define weak_odr protected void @wo_pcallee(%class.T* %this, i8* %c) { ret void }
@@ -151,7 +151,7 @@ define void @wo_pcaller(%class.T* %this, i8* %c) {
ret void
; CHECK-SCO-LABEL: wo_pcaller:
-; CHECK-SCO: b wo_pcallee
+; CHECK-SCO: bl wo_pcallee
}
define weak_odr void @wo_callee(%class.T* %this, i8* %c) { ret void }
@@ -169,7 +169,7 @@ define void @w_pcaller(i8* %ptr) {
ret void
; CHECK-SCO-LABEL: w_pcaller:
-; CHECK-SCO: b w_pcallee
+; CHECK-SCO: bl w_pcallee
}
define weak hidden void @w_hcallee(i8* %ptr) { ret void }
@@ -178,7 +178,7 @@ define void @w_hcaller(i8* %ptr) {
ret void
; CHECK-SCO-LABEL: w_hcaller:
-; CHECK-SCO: b w_hcallee
+; CHECK-SCO: bl w_hcallee
}
define weak void @w_callee(i8* %ptr) { ret void }
diff --git a/test/CodeGen/SPARC/soft-float.ll b/test/CodeGen/SPARC/soft-float.ll
index 53ca1974659e..582804444f3b 100644
--- a/test/CodeGen/SPARC/soft-float.ll
+++ b/test/CodeGen/SPARC/soft-float.ll
@@ -45,21 +45,21 @@ define fp128 @test_multf3(fp128 %a, fp128 %b) #0 {
}
define float @test_subsf3(float %a, float %b) #0 {
- ; CHCEK-LABEL: test_subsf3:
+ ; CHECK-LABEL: test_subsf3:
; CHECK: call __subsf3
%sub = fsub float %a, %b
ret float %sub
}
define double @test_subdf3(double %a, double %b) #0 {
- ; CHCEK-LABEL: test_subdf3:
+ ; CHECK-LABEL: test_subdf3:
; CHECK: call __subdf3
%sub = fsub double %a, %b
ret double %sub
}
define fp128 @test_subtf3(fp128 %a, fp128 %b) #0 {
- ; CHCEK-LABEL: test_subtf3:
+ ; CHECK-LABEL: test_subtf3:
; CHECK: call __subtf3
%sub = fsub fp128 %a, %b
ret fp128 %sub
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
index b50253bf2b03..4d7cb765d7b9 100644
--- a/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -371,6 +371,40 @@ define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
}
; Make sure that we merge the consecutive load/store sequence below and use a
+; word (16 bit) instead of a byte copy for complicated address calculation.
+; .
+; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
+; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
+; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
+; CHECK: movw %[[REG]], (%{{.*}})
+define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) {
+ br label %1
+
+; <label>:1
+ %.09 = phi i64 [ 0, %0 ], [ %13, %1 ]
+ %.08 = phi i8* [ %b, %0 ], [ %12, %1 ]
+ %2 = load i8, i8* %.08, align 1
+ %3 = sext i8 %2 to i64
+ %4 = getelementptr inbounds i8, i8* %c, i64 %3
+ %5 = load i8, i8* %4, align 1
+ %6 = add nsw i64 %3, 1
+ %7 = getelementptr inbounds i8, i8* %c, i64 %6
+ %8 = load i8, i8* %7, align 1
+ %9 = getelementptr inbounds i8, i8* %a, i64 %.09
+ store i8 %5, i8* %9, align 1
+ %10 = or i64 %.09, 1
+ %11 = getelementptr inbounds i8, i8* %a, i64 %10
+ store i8 %8, i8* %11, align 1
+ %12 = getelementptr inbounds i8, i8* %.08, i64 1
+ %13 = add nuw nsw i64 %.09, 2
+ %14 = icmp slt i64 %13, %n
+ br i1 %14, label %1, label %15
+
+; <label>:15
+ ret void
+}
+
+; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy even if there are intermediate sign
; extensions.
; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetSext:
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 9b4d776b29e3..f65f485cc62c 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -209,34 +209,22 @@ entry:
}
define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp {
-; X32-AVX2-LABEL: QQ64:
-; X32-AVX2: ## BB#0: ## %entry
-; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX2-NEXT: movl (%eax), %ecx
-; X32-AVX2-NEXT: movl 4(%eax), %eax
-; X32-AVX2-NEXT: vmovd %ecx, %xmm0
-; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; X32-AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX2-NEXT: retl
+; X32-LABEL: QQ64:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
;
; X64-LABEL: QQ64:
; X64: ## BB#0: ## %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
-;
-; X32-AVX512VL-LABEL: QQ64:
-; X32-AVX512VL: ## BB#0: ## %entry
-; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512VL-NEXT: movl (%eax), %ecx
-; X32-AVX512VL-NEXT: movl 4(%eax), %eax
-; X32-AVX512VL-NEXT: vmovd %ecx, %xmm0
-; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; X32-AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512VL-NEXT: retl
entry:
%q = load i64, i64* %ptr, align 4
%q0 = insertelement <4 x i64> undef, i64 %q, i32 0
@@ -1105,55 +1093,30 @@ define <4 x double> @splat_concat4(double %d) {
; Those test cases exerce the latter.
define void @isel_crash_16b(i8* %cV_R.addr) {
-; X32-AVX2-LABEL: isel_crash_16b:
-; X32-AVX2: ## BB#0: ## %eintry
-; X32-AVX2-NEXT: subl $60, %esp
-; X32-AVX2-NEXT: Lcfi0:
-; X32-AVX2-NEXT: .cfi_def_cfa_offset 64
-; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; X32-AVX2-NEXT: vmovaps %xmm0, (%esp)
-; X32-AVX2-NEXT: vpbroadcastb (%eax), %xmm1
-; X32-AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT: addl $60, %esp
-; X32-AVX2-NEXT: retl
-;
-; X64-AVX2-LABEL: isel_crash_16b:
-; X64-AVX2: ## BB#0: ## %eintry
-; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movb (%rdi), %al
-; X64-AVX2-NEXT: vmovd %eax, %xmm1
-; X64-AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: retq
-;
-; X32-AVX512VL-LABEL: isel_crash_16b:
-; X32-AVX512VL: ## BB#0: ## %eintry
-; X32-AVX512VL-NEXT: subl $60, %esp
-; X32-AVX512VL-NEXT: Lcfi0:
-; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 64
-; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; X32-AVX512VL-NEXT: vmovaps %xmm0, (%esp)
-; X32-AVX512VL-NEXT: vpbroadcastb (%eax), %xmm1
-; X32-AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT: addl $60, %esp
-; X32-AVX512VL-NEXT: retl
+; X32-LABEL: isel_crash_16b:
+; X32: ## BB#0: ## %eintry
+; X32-NEXT: subl $60, %esp
+; X32-NEXT: Lcfi0:
+; X32-NEXT: .cfi_def_cfa_offset 64
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X32-NEXT: vmovaps %xmm0, (%esp)
+; X32-NEXT: vpbroadcastb (%eax), %xmm1
+; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: addl $60, %esp
+; X32-NEXT: retl
;
-; X64-AVX512VL-LABEL: isel_crash_16b:
-; X64-AVX512VL: ## BB#0: ## %eintry
-; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT: movb (%rdi), %al
-; X64-AVX512VL-NEXT: vmovd %eax, %xmm1
-; X64-AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
-; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT: retq
+; X64-LABEL: isel_crash_16b:
+; X64: ## BB#0: ## %eintry
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movb (%rdi), %al
+; X64-NEXT: vmovd %eax, %xmm1
+; X64-NEXT: vpbroadcastb %xmm1, %xmm1
+; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: retq
eintry:
%__a.addr.i = alloca <2 x i64>, align 16
%__b.addr.i = alloca <2 x i64>, align 16
@@ -1277,55 +1240,30 @@ eintry:
}
define void @isel_crash_8w(i16* %cV_R.addr) {
-; X32-AVX2-LABEL: isel_crash_8w:
-; X32-AVX2: ## BB#0: ## %entry
-; X32-AVX2-NEXT: subl $60, %esp
-; X32-AVX2-NEXT: Lcfi4:
-; X32-AVX2-NEXT: .cfi_def_cfa_offset 64
-; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; X32-AVX2-NEXT: vmovaps %xmm0, (%esp)
-; X32-AVX2-NEXT: vpbroadcastw (%eax), %xmm1
-; X32-AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT: addl $60, %esp
-; X32-AVX2-NEXT: retl
-;
-; X64-AVX2-LABEL: isel_crash_8w:
-; X64-AVX2: ## BB#0: ## %entry
-; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movw (%rdi), %ax
-; X64-AVX2-NEXT: vmovd %eax, %xmm1
-; X64-AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
-; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: retq
-;
-; X32-AVX512VL-LABEL: isel_crash_8w:
-; X32-AVX512VL: ## BB#0: ## %entry
-; X32-AVX512VL-NEXT: subl $60, %esp
-; X32-AVX512VL-NEXT: Lcfi4:
-; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 64
-; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; X32-AVX512VL-NEXT: vmovaps %xmm0, (%esp)
-; X32-AVX512VL-NEXT: vpbroadcastw (%eax), %xmm1
-; X32-AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT: addl $60, %esp
-; X32-AVX512VL-NEXT: retl
+; X32-LABEL: isel_crash_8w:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: subl $60, %esp
+; X32-NEXT: Lcfi4:
+; X32-NEXT: .cfi_def_cfa_offset 64
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X32-NEXT: vmovaps %xmm0, (%esp)
+; X32-NEXT: vpbroadcastw (%eax), %xmm1
+; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: addl $60, %esp
+; X32-NEXT: retl
;
-; X64-AVX512VL-LABEL: isel_crash_8w:
-; X64-AVX512VL: ## BB#0: ## %entry
-; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT: movw (%rdi), %ax
-; X64-AVX512VL-NEXT: vmovd %eax, %xmm1
-; X64-AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
-; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT: retq
+; X64-LABEL: isel_crash_8w:
+; X64: ## BB#0: ## %entry
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movw (%rdi), %ax
+; X64-NEXT: vmovd %eax, %xmm1
+; X64-NEXT: vpbroadcastw %xmm1, %xmm1
+; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NEXT: retq
entry:
%__a.addr.i = alloca <2 x i64>, align 16
%__b.addr.i = alloca <2 x i64>, align 16
@@ -1605,24 +1543,24 @@ eintry:
}
define void @isel_crash_2q(i64* %cV_R.addr) {
-; X32-AVX2-LABEL: isel_crash_2q:
-; X32-AVX2: ## BB#0: ## %entry
-; X32-AVX2-NEXT: subl $60, %esp
-; X32-AVX2-NEXT: Lcfi12:
-; X32-AVX2-NEXT: .cfi_def_cfa_offset 64
-; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; X32-AVX2-NEXT: vmovaps %xmm0, (%esp)
-; X32-AVX2-NEXT: movl (%eax), %ecx
-; X32-AVX2-NEXT: movl 4(%eax), %eax
-; X32-AVX2-NEXT: vmovd %ecx, %xmm1
-; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; X32-AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
-; X32-AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; X32-AVX2-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT: addl $60, %esp
-; X32-AVX2-NEXT: retl
+; X32-LABEL: isel_crash_2q:
+; X32: ## BB#0: ## %entry
+; X32-NEXT: subl $60, %esp
+; X32-NEXT: Lcfi12:
+; X32-NEXT: .cfi_def_cfa_offset 64
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X32-NEXT: vmovaps %xmm0, (%esp)
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: vmovd %ecx, %xmm1
+; X32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
+; X32-NEXT: addl $60, %esp
+; X32-NEXT: retl
;
; X64-AVX2-LABEL: isel_crash_2q:
; X64-AVX2: ## BB#0: ## %entry
@@ -1635,25 +1573,6 @@ define void @isel_crash_2q(i64* %cV_R.addr) {
; X64-AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: retq
;
-; X32-AVX512VL-LABEL: isel_crash_2q:
-; X32-AVX512VL: ## BB#0: ## %entry
-; X32-AVX512VL-NEXT: subl $60, %esp
-; X32-AVX512VL-NEXT: Lcfi12:
-; X32-AVX512VL-NEXT: .cfi_def_cfa_offset 64
-; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; X32-AVX512VL-NEXT: vmovaps %xmm0, (%esp)
-; X32-AVX512VL-NEXT: movl (%eax), %ecx
-; X32-AVX512VL-NEXT: movl 4(%eax), %eax
-; X32-AVX512VL-NEXT: vmovd %ecx, %xmm1
-; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
-; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; X32-AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT: addl $60, %esp
-; X32-AVX512VL-NEXT: retl
-;
; X64-AVX512VL-LABEL: isel_crash_2q:
; X64-AVX512VL: ## BB#0: ## %entry
; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
@@ -1752,7 +1671,7 @@ define void @isel_crash_4q(i64* %cV_R.addr) {
; X32-AVX512VL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
; X32-AVX512VL-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
; X32-AVX512VL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; X32-AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm1, %ymm1
+; X32-AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
; X32-AVX512VL-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
; X32-AVX512VL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp)
; X32-AVX512VL-NEXT: movl %ebp, %esp
diff --git a/test/CodeGen/X86/avx512-any_extend_load.ll b/test/CodeGen/X86/avx512-any_extend_load.ll
index 656b618eff55..87f8cc9a418e 100644
--- a/test/CodeGen/X86/avx512-any_extend_load.ll
+++ b/test/CodeGen/X86/avx512-any_extend_load.ll
@@ -22,10 +22,8 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
; KNL-LABEL: any_extend_load_v8i32:
; KNL: # BB#0:
-; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovq %xmm0, (%rdi)
; KNL-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-extract-subvector.ll b/test/CodeGen/X86/avx512-extract-subvector.ll
index 9e8662452822..391bf6ba4554 100644
--- a/test/CodeGen/X86/avx512-extract-subvector.ll
+++ b/test/CodeGen/X86/avx512-extract-subvector.ll
@@ -60,7 +60,7 @@ define <32 x i8> @extract_subvector256_v64i8(<64 x i8> %x) nounwind {
define void @extract_subvector256_v8f64_store(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v8f64_store:
; SKX: ## BB#0: ## %entry
-; SKX-NEXT: vextractf64x2 $1, %ymm0, (%rdi)
+; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
@@ -72,7 +72,7 @@ entry:
define void @extract_subvector256_v8f32_store(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v8f32_store:
; SKX: ## BB#0: ## %entry
-; SKX-NEXT: vextractf32x4 $1, %ymm0, (%rdi)
+; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -84,7 +84,7 @@ entry:
define void @extract_subvector256_v4i64_store(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v4i64_store:
; SKX: ## BB#0: ## %entry
-; SKX-NEXT: vextracti64x2 $1, %ymm0, (%rdi)
+; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
; SKX-NEXT: retq
entry:
%0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
@@ -96,7 +96,7 @@ entry:
define void @extract_subvector256_v8i32_store(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v8i32_store:
; SKX: ## BB#0: ## %entry
-; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
+; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
; SKX-NEXT: retq
entry:
%0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -108,7 +108,7 @@ entry:
define void @extract_subvector256_v16i16_store(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v16i16_store:
; SKX: ## BB#0: ## %entry
-; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
+; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
; SKX-NEXT: retq
entry:
%0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -120,7 +120,7 @@ entry:
define void @extract_subvector256_v32i8_store(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v32i8_store:
; SKX: ## BB#0: ## %entry
-; SKX-NEXT: vextracti32x4 $1, %ymm0, (%rdi)
+; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
; SKX-NEXT: retq
entry:
%0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index c6cc74289971..26d14fa0840f 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -463,7 +463,7 @@ define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
; SKX-LABEL: extract_v4i64:
; SKX: ## BB#0:
; SKX-NEXT: vpextrq $1, %xmm0, %rax
-; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm0
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
; SKX-NEXT: retq
%r1 = extractelement <4 x i64> %x, i32 1
@@ -521,7 +521,7 @@ define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
; SKX-LABEL: extract_v8i32:
; SKX: ## BB#0:
; SKX-NEXT: vpextrd $1, %xmm0, %eax
-; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrd $1, %xmm0, (%rdi)
; SKX-NEXT: retq
%r1 = extractelement <8 x i32> %x, i32 1
@@ -582,7 +582,7 @@ define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
; SKX-LABEL: extract_v16i16:
; SKX: ## BB#0:
; SKX-NEXT: vpextrw $1, %xmm0, %eax
-; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: retq
@@ -646,7 +646,7 @@ define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
; SKX-LABEL: extract_v32i8:
; SKX: ## BB#0:
; SKX-NEXT: vpextrb $1, %xmm0, %eax
-; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
@@ -714,9 +714,9 @@ define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
; SKX: ## BB#0:
; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
-; SKX-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
%val = load i64, i64* %ptr
%r1 = insertelement <4 x i64> %x, i64 %val, i32 1
@@ -780,9 +780,9 @@ define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
; SKX: ## BB#0:
; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1
-; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
%val = load i32, i32* %ptr
%r1 = insertelement <8 x i32> %x, i32 %val, i32 1
@@ -846,9 +846,9 @@ define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
; SKX: ## BB#0:
; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrw $1, %edi, %xmm1, %xmm1
-; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
%val = load i16, i16* %ptr
%r1 = insertelement <16 x i16> %x, i16 %val, i32 1
@@ -912,9 +912,9 @@ define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) {
; SKX: ## BB#0:
; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrb $1, %edi, %xmm1, %xmm1
-; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
%val = load i8, i8* %ptr
%r1 = insertelement <32 x i8> %x, i8 %val, i32 1
@@ -1014,9 +1014,9 @@ define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) {
;
; SKX-LABEL: test_insert_128_v16i16:
; SKX: ## BB#0:
-; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1
-; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
%r = insertelement <16 x i16> %x, i16 %y, i32 10
ret <16 x i16> %r
@@ -1032,9 +1032,9 @@ define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) {
;
; SKX-LABEL: test_insert_128_v32i8:
; SKX: ## BB#0:
-; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
; SKX-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
-; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; SKX-NEXT: retq
%r = insertelement <32 x i8> %x, i8 %y, i32 20
ret <32 x i8> %r
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index f422a0354988..3c649e18bc38 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -2868,3 +2868,187 @@ define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask
}
declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
+
+define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
+; CHECK-LABEL: test_mask_vextractf32x4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: kshiftlw $12, %k1, %k0
+; CHECK-NEXT: kshiftrw $15, %k0, %k0
+; CHECK-NEXT: kshiftlw $13, %k1, %k2
+; CHECK-NEXT: kshiftrw $15, %k2, %k2
+; CHECK-NEXT: kshiftlw $15, %k1, %k3
+; CHECK-NEXT: kshiftrw $15, %k3, %k3
+; CHECK-NEXT: kshiftlw $14, %k1, %k1
+; CHECK-NEXT: kshiftrw $15, %k1, %k1
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: kmovw %k3, %ecx
+; CHECK-NEXT: vmovd %ecx, %xmm2
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
+; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
+; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask)
+ ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8)
+
+define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
+; CHECK-LABEL: test_mask_vextracti64x4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: kshiftlw $12, %k1, %k0
+; CHECK-NEXT: kshiftrw $15, %k0, %k0
+; CHECK-NEXT: kshiftlw $13, %k1, %k2
+; CHECK-NEXT: kshiftrw $15, %k2, %k2
+; CHECK-NEXT: kshiftlw $15, %k1, %k3
+; CHECK-NEXT: kshiftrw $15, %k3, %k3
+; CHECK-NEXT: kshiftlw $14, %k1, %k1
+; CHECK-NEXT: kshiftrw $15, %k1, %k1
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: kmovw %k3, %ecx
+; CHECK-NEXT: vmovd %ecx, %xmm2
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2
+; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
+; CHECK-NEXT: vpmovsxdq %xmm2, %ymm2
+; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask)
+ ret <4 x i64> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8)
+
+define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
+; CHECK-LABEL: test_maskz_vextracti32x4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: kshiftlw $12, %k1, %k0
+; CHECK-NEXT: kshiftrw $15, %k0, %k0
+; CHECK-NEXT: kshiftlw $13, %k1, %k2
+; CHECK-NEXT: kshiftrw $15, %k2, %k2
+; CHECK-NEXT: kshiftlw $15, %k1, %k3
+; CHECK-NEXT: kshiftrw $15, %k3, %k3
+; CHECK-NEXT: kshiftlw $14, %k1, %k1
+; CHECK-NEXT: kshiftrw $15, %k1, %k1
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: kmovw %k3, %ecx
+; CHECK-NEXT: vmovd %ecx, %xmm1
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; CHECK-NEXT: kmovw %k2, %eax
+; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT: vpsrad $31, %xmm1, %xmm1
+; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask)
+ ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8)
+
+define <4 x double> @test_vextractf64x4(<8 x double> %a) {
+; CHECK-LABEL: test_vextractf64x4:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; CHECK-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1)
+ ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8)
+
+declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res2, %res3
+ ret <16 x float> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res2, %res3
+ ret <8 x double> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res2, %res3
+ ret <8 x i64> %res4
+}
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 5442693806f3..3015a2b499ff 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -1243,53 +1243,6 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
-define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
-; CHECK-LABEL: test_mask_vextractf32x4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm0 {%k1}
-; CHECK-NEXT: retq
- %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask)
- ret <4 x float> %res
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8)
-
-define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
-; CHECK-LABEL: test_mask_vextracti64x4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vextracti64x4 $2, %zmm1, %ymm0 {%k1}
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 2, <4 x i64> %b, i8 %mask)
- ret <4 x i64> %res
-}
-
-declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8)
-
-define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
-; CHECK-LABEL: test_maskz_vextracti32x4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask)
- ret <4 x i32> %res
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8)
-
-define <4 x double> @test_vextractf64x4(<8 x double> %a) {
-; CHECK-LABEL: test_vextractf64x4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vextractf64x4 $2, %zmm0, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 2, <4 x double> zeroinitializer, i8 -1)
- ret <4 x double> %res
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8)
-
declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
@@ -3984,86 +3937,6 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<1
ret <16 x float> %res2
}
-declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
- %res3 = fadd <16 x float> %res, %res1
- %res4 = fadd <16 x float> %res2, %res3
- ret <16 x float> %res4
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
- %res3 = add <16 x i32> %res, %res1
- %res4 = add <16 x i32> %res2, %res3
- ret <16 x i32> %res4
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
- %res3 = fadd <8 x double> %res, %res1
- %res4 = fadd <8 x double> %res2, %res3
- ret <8 x double> %res4
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
- %res3 = add <8 x i64> %res, %res1
- %res4 = add <8 x i64> %res2, %res3
- ret <8 x i64> %res4
-}
-
declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32)
define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index a5bceb7670a0..2200f1159880 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -30,9 +30,9 @@ define <8 x i1> @test2(<2 x i1> %a) {
; CHECK: # BB#0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
-; CHECK-NEXT: vpmovm2q %k0, %zmm0
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: vpmovm2q %k0, %zmm1
+; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; CHECK-NEXT: vpmovq2m %zmm0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-vbroadcasti128.ll b/test/CodeGen/X86/avx512-vbroadcasti128.ll
index 09c48ddf81a1..ad8a29cacd82 100644
--- a/test/CodeGen/X86/avx512-vbroadcasti128.ll
+++ b/test/CodeGen/X86/avx512-vbroadcasti128.ll
@@ -237,7 +237,7 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
; X64-AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X64-AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
-; X64-AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: PR29088:
@@ -245,7 +245,7 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
; X64-AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512BWVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X64-AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
-; X64-AVX512BWVL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512BWVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BWVL-NEXT: retq
;
; X64-AVX512DQVL-LABEL: PR29088:
@@ -253,7 +253,7 @@ define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512DQVL-NEXT: vxorps %ymm1, %ymm1, %ymm1
; X64-AVX512DQVL-NEXT: vmovaps %ymm1, (%rsi)
-; X64-AVX512DQVL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512DQVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQVL-NEXT: retq
%ld = load <4 x i32>, <4 x i32>* %p0
store <8 x float> zeroinitializer, <8 x float>* %p1
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 6fd111577440..7a9d7d7885ff 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -30,7 +30,7 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
-; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
@@ -79,7 +79,7 @@ define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
-; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
@@ -129,7 +129,7 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
-; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
@@ -178,7 +178,7 @@ define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask)
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
-; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x38,0xc0,0x01]
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
%vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
new file mode 100644
index 000000000000..f4cf22c5ed3a
--- /dev/null
+++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s
+
+declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0
+; CHECK-NEXT: kmovb %edi, %k0
+; CHECK-NEXT: kshiftlb $7, %k0, %k1
+; CHECK-NEXT: kshiftrb $7, %k1, %k1
+; CHECK-NEXT: kshiftlb $6, %k0, %k0
+; CHECK-NEXT: kshiftrb $7, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vmovq %rax, %xmm2
+; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: vmovq %rax, %xmm3
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; CHECK-NEXT: vpsllq $63, %xmm2, %xmm2
+; CHECK-NEXT: vpsrad $31, %xmm2, %xmm2
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
+; CHECK-NEXT: vandpd %xmm0, %xmm2, %xmm2
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res2, %res3
+ ret <2 x double> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
+; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res2, %res3
+ ret <8 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x float>, i32, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x8_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res2, %res3
+ ret <16 x float> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double>, <2 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm3
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>, i32, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x8_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: vpaddd %zmm3, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64>, <2 x i64>, i32, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm3
+; CHECK-NEXT: kmovb %edi, %k1
+; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res2, %res3
+ ret <8 x i64> %res4
+}
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index 5826bb6fad23..375d63264517 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -325,127 +325,6 @@ define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x dou
ret <2 x double> %res2
}
-
-declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
- %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1)
- %res3 = fadd <2 x double> %res, %res1
- %res4 = fadd <2 x double> %res2, %res3
- ret <2 x double> %res4
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
-; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1)
- %res3 = fadd <8 x float> %res, %res1
- %res4 = fadd <8 x float> %res2, %res3
- ret <8 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x float>, i32, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x8_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
- %res3 = fadd <16 x float> %res, %res1
- %res4 = fadd <16 x float> %res2, %res3
- ret <16 x float> %res4
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double>, <2 x double>, i32, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0
-; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
- %res3 = fadd <8 x double> %res, %res1
- %res4 = fadd <8 x double> %res3, %res2
- ret <8 x double> %res4
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>, i32, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x8_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
- %res3 = add <16 x i32> %res, %res1
- %res4 = add <16 x i32> %res3, %res2
- ret <16 x i32> %res4
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64>, <2 x i64>, i32, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1
-; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
- %res3 = add <8 x i64> %res, %res1
- %res4 = add <8 x i64> %res2, %res3
- ret <8 x i64> %res4
-}
-
declare i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double>, i32, i8)
define i8 @test_int_x86_avx512_mask_fpclass_pd_512(<8 x double> %x0, i8 %x1) {
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index 9bf989df22a3..f8460bf880f9 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -1560,3 +1560,62 @@ define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8
declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
+; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res3, %res2
+ ret <2 x double> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x double>, i32, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb]
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res2, %res3
+ ret <4 x double> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64>, <2 x i64>, i32, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
+; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res3, %res2
+ ret <4 x i64> %res4
+}
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index eb9c6b64bcf6..3430c5715376 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -549,66 +549,6 @@ define <8 x float>@test_int_x86_avx512_mask_range_ps_256(<8 x float> %x0, <8 x f
ret <8 x float> %res2
}
-declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc2,0x01]
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x19,0xc0,0x01]
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
-; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
- %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1)
- %res3 = fadd <2 x double> %res, %res1
- %res4 = fadd <2 x double> %res3, %res2
- ret <2 x double> %res4
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x double>, i32, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
-; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
-; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xd9,0x01]
-; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0]
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4)
- %res3 = fadd <4 x double> %res, %res1
- %res4 = fadd <4 x double> %res2, %res3
- ret <4 x double> %res4
-}
-
-declare <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64>, <2 x i64>, i32, <4 x i64>, i8)
-
-define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
-; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
-; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xd9,0x01]
-; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
-; CHECK-NEXT: vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4)
- %res3 = add <4 x i64> %res, %res1
- %res4 = add <4 x i64> %res3, %res2
- ret <4 x i64> %res4
-}
-
declare i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float>, i32, i8)
define i8 @test_int_x86_avx512_mask_fpclass_ps_128(<4 x float> %x0, i8 %x1) {
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index 8d44af7b7a4c..c63d47d780d1 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -4773,3 +4773,63 @@ define <4 x float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, <
ret <4 x float> %res4
}
+declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01]
+; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc0,0x01]
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0]
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x float>, i32, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01]
+; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res2, %res3
+ ret <8 x float> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32>, <4 x i32>, i32, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01]
+; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+
+ %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> zeroinitializer, i8 %x4)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 94095f549e51..82014283246e 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -3621,26 +3621,6 @@ define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64
ret <4 x i64> %res2
}
-declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01]
-; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc2,0x01]
-; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x19,0xc0,0x01]
-; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-
declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2 x double>, i8)
define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
@@ -3709,47 +3689,6 @@ define <8 x float>@test_int_x86_avx512_mask_getmant_ps_256(<8 x float> %x0, <8 x
ret <8 x float> %res2
}
-declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x float>, i32, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
-; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xd9,0x01]
-; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0]
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4)
- %res3 = fadd <8 x float> %res, %res1
- %res4 = fadd <8 x float> %res2, %res3
- ret <8 x float> %res4
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32>, <4 x i32>, i32, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
-; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xd9,0x01]
-; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
-; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-
- %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> zeroinitializer, i8 %x4)
- %res3 = add <8 x i32> %res, %res1
- %res4 = add <8 x i32> %res2, %res3
- ret <8 x i32> %res4
-}
-
declare <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32, i8)
define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) {
diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
new file mode 100644
index 000000000000..ab797e04b400
--- /dev/null
+++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
@@ -0,0 +1,72 @@
+; Test ensuring debug intrinsics do not affect generated function prologue.
+;
+; RUN: llc -O1 -mtriple=x86_64-unknown-unknown -o - %s | FileCheck %s
+
+@a = local_unnamed_addr global i64 0, align 8
+
+define void @noDebug() {
+entry:
+ %0 = load i64, i64* @a, align 8
+ %1 = load i64, i64* @a, align 8
+ %2 = load i64, i64* @a, align 8
+ %3 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %0, i64 %1)
+ %4 = extractvalue { i64, i1 } %3, 0
+ %5 = tail call i64 @fn1(i64 %4, i64 %2)
+ tail call void (...) @printf()
+ tail call void (...) @printf(i64 1, i64 2, i64 3, i64 4, i32 0, i64 0, i64 %4, i64 %5)
+ ret void
+}
+
+; CHECK-LABEL: noDebug
+; CHECK: addq $24, %rsp
+; CHECK: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: retq
+
+
+define void @withDebug() !dbg !18 {
+entry:
+ %0 = load i64, i64* @a, align 8
+ %1 = load i64, i64* @a, align 8
+ %2 = load i64, i64* @a, align 8
+ %3 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %0, i64 %1)
+ %4 = extractvalue { i64, i1 } %3, 0
+ %5 = tail call i64 @fn1(i64 %4, i64 %2)
+ tail call void @llvm.dbg.value(metadata i64 %4, i64 0, metadata !23, metadata !33), !dbg !34
+ tail call void @llvm.dbg.value(metadata i64 %5, i64 0, metadata !22, metadata !33), !dbg !35
+ tail call void (...) @printf()
+ tail call void (...) @printf(i64 1, i64 2, i64 3, i64 4, i32 0, i64 0, i64 %4, i64 %5)
+ ret void
+}
+
+; CHECK-LABEL: withDebug
+; CHECK: #DEBUG_VALUE: test:j <- %RBX
+; CHECK-NEXT: addq $24, %rsp
+; CHECK: popq %rbx
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: retq
+
+declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)
+declare i64 @fn1(i64, i64)
+
+declare void @printf(...)
+
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+
+!llvm.dbg.cu = !{!1}
+!llvm.module.flags = !{!15, !16}
+
+!1 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !2, producer: "clang version 4.0.0")
+!2 = !DIFile(filename: "test.cpp", directory: "")
+!11 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
+!15 = !{i32 2, !"Dwarf Version", i32 4}
+!16 = !{i32 2, !"Debug Info Version", i32 3}
+!18 = distinct !DISubprogram(name: "test", scope: !2, file: !2, line: 5, unit: !1)
+!22 = !DILocalVariable(name: "i", scope: !18, file: !2, line: 6, type: !11)
+!23 = !DILocalVariable(name: "j", scope: !18, file: !2, line: 7, type: !11)
+!33 = !DIExpression()
+!34 = !DILocation(line: 7, column: 17, scope: !18)
+!35 = !DILocation(line: 6, column: 8, scope: !18)
+!36 = !DILocation(line: 9, column: 3, scope: !18)
+!37 = !DILocation(line: 10, column: 10, scope: !18)
diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
index 8614d1b4c6c3..e86d094ac341 100644
--- a/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
+++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic.ll
@@ -2,31 +2,56 @@
;
; RUN: llc -O1 -mtriple=x86_64-unknown-unknown -o - %s | FileCheck %s
-
-define i64 @noDebug(i64 %a) {
+define i64 @fn1NoDebug(i64 %a) {
%call = call i64 @fn(i64 %a, i64 0)
ret i64 %call
}
-; CHECK-LABEL: noDebug
+; CHECK-LABEL: fn1NoDebug
; CHECK: popq %rcx
-; CHECK: ret
-
+; CHECK-NEXT: ret
-define i64 @withDebug(i64 %a) !dbg !4 {
+define i64 @fn1WithDebug(i64 %a) !dbg !4 {
%call = call i64 @fn(i64 %a, i64 0)
tail call void @llvm.dbg.value(metadata i64 %call, i64 0, metadata !5, metadata !6), !dbg !7
ret i64 %call
}
-; CHECK-LABEL: withDebug
+; CHECK-LABEL: fn1WithDebug
; CHECK: popq %rcx
-; CHECK: ret
+; CHECK-NEXT: ret
+
+%struct.Buffer = type { i8, [63 x i8] }
+
+define void @fn2NoDebug(%struct.Buffer* byval align 64 %p1) {
+ ret void
+}
+
+; CHECK-LABEL: fn2NoDebug
+; CHECK: and
+; CHECK-NOT: add
+; CHECK-NOT: sub
+; CHECK: mov
+; CHECK-NEXT: pop
+; CHECK-NEXT: ret
+
+define void @fn2WithDebug(%struct.Buffer* byval align 64 %p1) !dbg !4 {
+ call void @llvm.dbg.declare(metadata %struct.Buffer* %p1, metadata !5, metadata !6), !dbg !7
+ ret void
+}
+; CHECK-LABEL: fn2WithDebug
+; CHECK: and
+; CHECK-NOT: add
+; CHECK-NOT: sub
+; CHECK: mov
+; CHECK-NEXT: pop
+; CHECK-NEXT: ret
declare i64 @fn(i64, i64)
declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2,!3}
diff --git a/test/CodeGen/X86/i64-to-float.ll b/test/CodeGen/X86/i64-to-float.ll
index 8898551a9764..da92bdb55d7c 100644
--- a/test/CodeGen/X86/i64-to-float.ll
+++ b/test/CodeGen/X86/i64-to-float.ll
@@ -71,34 +71,32 @@ define <2 x double> @mask_uitofp_2i64_2f64(<2 x i64> %a) nounwind {
define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
; X32-SSE-LABEL: mask_sitofp_4i64_4f32:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: mask_sitofp_4i64_4f32:
; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-AVX-NEXT: vzeroupper
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: mask_sitofp_4i64_4f32:
; X64-SSE: # BB#0:
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mask_sitofp_4i64_4f32:
; X64-AVX: # BB#0:
-; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
@@ -110,34 +108,32 @@ define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind {
; X32-SSE-LABEL: mask_uitofp_4i64_4f32:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: mask_uitofp_4i64_4f32:
; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-AVX-NEXT: vzeroupper
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: mask_uitofp_4i64_4f32:
; X64-SSE: # BB#0:
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mask_uitofp_4i64_4f32:
; X64-AVX: # BB#0:
-; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
index cba9a221f774..4e65b169c7e6 100644
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -1009,7 +1009,7 @@ define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
;
; SKX-LABEL: one_mask_bit_set3:
; SKX: ## BB#0:
-; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vmovq %xmm0, 16(%rdi)
; SKX-NEXT: retq
call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
@@ -1026,17 +1026,11 @@ define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
-; AVX512F-LABEL: one_mask_bit_set4:
-; AVX512F: ## BB#0:
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovhpd %xmm0, 24(%rdi)
-; AVX512F-NEXT: retq
-;
-; SKX-LABEL: one_mask_bit_set4:
-; SKX: ## BB#0:
-; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm0
-; SKX-NEXT: vmovhpd %xmm0, 24(%rdi)
-; SKX-NEXT: retq
+; AVX512-LABEL: one_mask_bit_set4:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi)
+; AVX512-NEXT: retq
call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
ret void
}
@@ -1109,19 +1103,12 @@ define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: load_one_mask_bit_set3:
-; AVX512F: ## BB#0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; SKX-LABEL: load_one_mask_bit_set3:
-; SKX: ## BB#0:
-; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; SKX-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
-; SKX-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; SKX-NEXT: retq
+; AVX512-LABEL: load_one_mask_bit_set3:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
ret <4 x i64> %res
}
@@ -1136,19 +1123,12 @@ define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %v
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
-; AVX512F-LABEL: load_one_mask_bit_set4:
-; AVX512F: ## BB#0:
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; SKX-LABEL: load_one_mask_bit_set4:
-; SKX: ## BB#0:
-; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm1
-; SKX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; SKX-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
-; SKX-NEXT: retq
+; AVX512-LABEL: load_one_mask_bit_set4:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
ret <4 x double> %res
}
diff --git a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
index 198a96df6b1f..c6ae85dda43a 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -488,7 +488,7 @@ define <8 x float> @stack_fold_xorps_ymm(<8 x float> %a0, <8 x float> %a1) {
define <4 x float> @stack_fold_extractf32x4(<8 x float> %a0, <8 x float> %a1) {
;CHECK-LABEL: stack_fold_extractf32x4
- ;CHECK: vextractf32x4 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+ ;CHECK: vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
%1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <4 x float> %1
@@ -496,7 +496,7 @@ define <4 x float> @stack_fold_extractf32x4(<8 x float> %a0, <8 x float> %a1) {
define <2 x double> @stack_fold_extractf64x2(<4 x double> %a0, <4 x double> %a1) {
;CHECK-LABEL: stack_fold_extractf64x2
- ;CHECK: vextractf64x2 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+ ;CHECK: vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <2 x i32> <i32 2, i32 3>
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <2 x double> %1
@@ -504,7 +504,7 @@ define <2 x double> @stack_fold_extractf64x2(<4 x double> %a0, <4 x double> %a1)
define <8 x float> @stack_fold_insertf32x4(<4 x float> %a0, <4 x float> %a1) {
;CHECK-LABEL: stack_fold_insertf32x4
- ;CHECK: vinsertf32x4 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vinsertf128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %2
@@ -512,7 +512,7 @@ define <8 x float> @stack_fold_insertf32x4(<4 x float> %a0, <4 x float> %a1) {
define <4 x double> @stack_fold_insertf64x2(<2 x double> %a0, <2 x double> %a1) {
;CHECK-LABEL: stack_fold_insertf64x2
- ;CHECK: vinsertf64x2 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vinsertf128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <2 x double> %a0, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %2
diff --git a/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
index 6847595e9278..77afc49b2576 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
@@ -445,7 +445,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16
define <4 x i32> @stack_fold_extracti32x4(<8 x i32> %a0, <8 x i32> %a1) {
;CHECK-LABEL: stack_fold_extracti32x4
- ;CHECK: vextracti32x4 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+ ;CHECK: vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
; add forces execution domain
%1 = add <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -455,7 +455,7 @@ define <4 x i32> @stack_fold_extracti32x4(<8 x i32> %a0, <8 x i32> %a1) {
define <2 x i64> @stack_fold_extracti64x2(<4 x i64> %a0, <4 x i64> %a1) {
;CHECK-LABEL: stack_fold_extracti64x2
- ;CHECK: vextracti64x2 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+ ;CHECK: vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
; add forces execution domain
%1 = add <4 x i64> %a0, <i64 1, i64 1, i64 1, i64 1>
%2 = shufflevector <4 x i64> %1, <4 x i64> %a1, <2 x i32> <i32 2, i32 3>
@@ -465,7 +465,7 @@ define <2 x i64> @stack_fold_extracti64x2(<4 x i64> %a0, <4 x i64> %a1) {
define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_inserti32x4
- ;CHECK: vinserti32x4 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; add forces execution domain
@@ -475,7 +475,7 @@ define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i64> @stack_fold_inserti64x2(<2 x i64> %a0, <2 x i64> %a1) {
;CHECK-LABEL: stack_fold_inserti64x2
- ;CHECK: vinserti64x2 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; add forces execution domain
diff --git a/test/CodeGen/X86/subvector-broadcast.ll b/test/CodeGen/X86/subvector-broadcast.ll
index b4f7cb9e106d..7aa3f393bbed 100644
--- a/test/CodeGen/X86/subvector-broadcast.ll
+++ b/test/CodeGen/X86/subvector-broadcast.ll
@@ -832,7 +832,7 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX512F-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX512F-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse:
@@ -841,7 +841,7 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX512BW-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX512BW-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse:
@@ -850,7 +850,7 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT: vmovapd (%ecx), %xmm0
; X32-AVX512DQ-NEXT: vmovapd %xmm0, (%eax)
-; X32-AVX512DQ-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
@@ -864,21 +864,21 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX512F-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX512F-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX512BW-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vmovapd (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vmovapd %xmm0, (%rsi)
-; X64-AVX512DQ-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
%1 = load <2 x double>, <2 x double>* %p0
store <2 x double> %1, <2 x double>* %p1
@@ -896,32 +896,14 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1)
; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT: retl
;
-; X32-AVX512F-LABEL: test_broadcast_2i64_4i64_reuse:
-; X32-AVX512F: ## BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
-; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_2i64_4i64_reuse:
-; X32-AVX512BW: ## BB#0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
-; X32-AVX512BW-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_2i64_4i64_reuse:
-; X32-AVX512DQ: ## BB#0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
-; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512DQ-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
+; X32-AVX512: ## BB#0:
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-AVX512-NEXT: vmovdqa (%ecx), %xmm0
+; X32-AVX512-NEXT: vmovdqa %xmm0, (%eax)
+; X32-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
; X64-AVX: ## BB#0:
@@ -930,26 +912,12 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1)
; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT: retq
;
-; X64-AVX512F-LABEL: test_broadcast_2i64_4i64_reuse:
-; X64-AVX512F: ## BB#0:
-; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_2i64_4i64_reuse:
-; X64-AVX512BW: ## BB#0:
-; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_2i64_4i64_reuse:
-; X64-AVX512DQ: ## BB#0:
-; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512DQ-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
+; X64-AVX512: ## BB#0:
+; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512-NEXT: retq
%1 = load <2 x i64>, <2 x i64>* %p0
store <2 x i64> %1, <2 x i64>* %p1
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -957,37 +925,21 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1)
}
define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
-; X32-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
-; X32-AVX: ## BB#0:
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX-NEXT: retl
-;
-; X32-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
-; X32-AVX512: ## BB#0:
-; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX512-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX512-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512-NEXT: retl
-;
-; X64-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
-; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX-NEXT: retq
-;
-; X64-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
-; X64-AVX512: ## BB#0:
-; X64-AVX512-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX512-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX512-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512-NEXT: retq
+; X32-LABEL: test_broadcast_4f32_8f32_reuse:
+; X32: ## BB#0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm0
+; X32-NEXT: vmovaps %xmm0, (%eax)
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_broadcast_4f32_8f32_reuse:
+; X64: ## BB#0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vmovaps %xmm0, (%rsi)
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
%1 = load <4 x float>, <4 x float>* %p0
store <4 x float> %1, <4 x float>* %p1
%2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -1010,7 +962,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1)
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
@@ -1024,7 +976,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1)
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %p0
store <4 x i32> %1, <4 x i32>* %p1
@@ -1048,7 +1000,7 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
@@ -1057,7 +1009,7 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0
; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
-; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:
@@ -1066,7 +1018,7 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
@@ -1080,21 +1032,21 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
-; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
%1 = load <8 x i16>, <8 x i16> *%p0
store <8 x i16> %1, <8 x i16>* %p1
@@ -1118,7 +1070,7 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1)
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
@@ -1127,7 +1079,7 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1)
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0
; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
-; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:
@@ -1136,7 +1088,7 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1)
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
@@ -1150,21 +1102,21 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1)
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
-; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
%1 = load <16 x i8>, <16 x i8> *%p0
store <16 x i8> %1, <16 x i8>* %p1
@@ -1194,7 +1146,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p
; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax)
-; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
@@ -1204,7 +1156,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p
; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax)
-; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
@@ -1214,7 +1166,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p
; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax)
-; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
@@ -1230,7 +1182,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p
; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi)
-; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
@@ -1238,7 +1190,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p
; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
-; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
@@ -1246,7 +1198,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p
; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi)
-; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %p0
store <4 x float> zeroinitializer, <4 x float>* %p1
@@ -1349,6 +1301,44 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
@gb4 = global <8 x i64> zeroinitializer, align 8
define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
+; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
+; X32-AVX1: ## BB#0: ## %entry
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X32-AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,2,0,3,0,4,0]
+; X32-AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X32-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3
+; X32-AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; X32-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
+; X32-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
+; X32-AVX1-NEXT: vmovups %ymm0, _ga4
+; X32-AVX1-NEXT: vmovups %ymm2, _gb4+32
+; X32-AVX1-NEXT: vmovups %ymm1, _gb4
+; X32-AVX1-NEXT: vzeroupper
+; X32-AVX1-NEXT: retl
+;
+; X32-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
+; X32-AVX2: ## BB#0: ## %entry
+; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
+; X32-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; X32-AVX2-NEXT: vmovdqu %ymm0, _ga4
+; X32-AVX2-NEXT: vmovdqu %ymm2, _gb4+32
+; X32-AVX2-NEXT: vmovdqu %ymm1, _gb4
+; X32-AVX2-NEXT: vzeroupper
+; X32-AVX2-NEXT: retl
+;
; X32-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX512: ## BB#0: ## %entry
; X32-AVX512-NEXT: vpaddq LCPI26_0, %ymm0, %ymm0
@@ -1359,6 +1349,45 @@ define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; X32-AVX512-NEXT: vmovdqu64 %zmm1, _gb4
; X32-AVX512-NEXT: retl
;
+; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
+; X64-AVX1: ## BB#0: ## %entry
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [3,4]
+; X64-AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2]
+; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [1,2,3,4]
+; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm6
+; X64-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4
+; X64-AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; X64-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; X64-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vmovups %ymm0, {{.*}}(%rip)
+; X64-AVX1-NEXT: vmovups %ymm2, _gb4+{{.*}}(%rip)
+; X64-AVX1-NEXT: vmovups %ymm1, {{.*}}(%rip)
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
+; X64-AVX2: ## BB#0: ## %entry
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
+; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vmovdqu %ymm0, {{.*}}(%rip)
+; X64-AVX2-NEXT: vmovdqu %ymm2, _gb4+{{.*}}(%rip)
+; X64-AVX2-NEXT: vmovdqu %ymm1, {{.*}}(%rip)
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX512: ## BB#0: ## %entry
; X64-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4]
@@ -1383,6 +1412,20 @@ entry:
@gb2 = global <8 x double> zeroinitializer, align 8
define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
+; X32-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
+; X32-AVX: ## BB#0: ## %entry
+; X32-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; X32-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; X32-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
+; X32-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
+; X32-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
+; X32-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
+; X32-AVX-NEXT: vmovupd %ymm0, _ga2
+; X32-AVX-NEXT: vmovupd %ymm2, _gb2+32
+; X32-AVX-NEXT: vmovupd %ymm1, _gb2
+; X32-AVX-NEXT: vzeroupper
+; X32-AVX-NEXT: retl
+;
; X32-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X32-AVX512: ## BB#0: ## %entry
; X32-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
@@ -1394,6 +1437,20 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b)
; X32-AVX512-NEXT: vmovupd %zmm1, _gb2
; X32-AVX512-NEXT: retl
;
+; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
+; X64-AVX: ## BB#0: ## %entry
+; X64-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
+; X64-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
+; X64-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
+; X64-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
+; X64-AVX-NEXT: vmovupd %ymm0, {{.*}}(%rip)
+; X64-AVX-NEXT: vmovupd %ymm2, _gb2+{{.*}}(%rip)
+; X64-AVX-NEXT: vmovupd %ymm1, {{.*}}(%rip)
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+;
; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX512: ## BB#0: ## %entry
; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index 2ced6de6aebe..2ad20a89cf26 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -204,7 +204,7 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
;
; AVX512VL-LABEL: fptosi_4f64_to_4i64:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
@@ -217,7 +217,7 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_4f64_to_4i64:
@@ -719,7 +719,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
;
; AVX512VL-LABEL: fptoui_4f64_to_4i64:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
@@ -732,7 +732,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f64_to_4i64:
@@ -1097,7 +1097,7 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_4f32_to_4i64:
@@ -1205,7 +1205,7 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
; AVX512VL-NEXT: vmovq %rcx, %xmm1
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_8f32_to_4i64:
@@ -1822,7 +1822,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f32_to_4i64:
@@ -2000,7 +2000,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
; AVX512VL-NEXT: vmovq %rcx, %xmm1
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_8f32_to_4i64:
@@ -2409,125 +2409,29 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
; SSE-NEXT: popq %r14
; SSE-NEXT: retq
;
-; VEX-LABEL: fptosi_2f128_to_4i32:
-; VEX: # BB#0:
-; VEX-NEXT: pushq %r14
-; VEX-NEXT: pushq %rbx
-; VEX-NEXT: subq $24, %rsp
-; VEX-NEXT: movq %rsi, %r14
-; VEX-NEXT: movq %rdi, %rbx
-; VEX-NEXT: movq %rdx, %rdi
-; VEX-NEXT: movq %rcx, %rsi
-; VEX-NEXT: callq __fixtfdi
-; VEX-NEXT: vmovq %rax, %xmm0
-; VEX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; VEX-NEXT: movq %rbx, %rdi
-; VEX-NEXT: movq %r14, %rsi
-; VEX-NEXT: callq __fixtfdi
-; VEX-NEXT: vmovq %rax, %xmm0
-; VEX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; VEX-NEXT: # xmm0 = xmm0[0],mem[0]
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; VEX-NEXT: addq $24, %rsp
-; VEX-NEXT: popq %rbx
-; VEX-NEXT: popq %r14
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptosi_2f128_to_4i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: subq $24, %rsp
-; AVX512F-NEXT: movq %rsi, %r14
-; AVX512F-NEXT: movq %rdi, %rbx
-; AVX512F-NEXT: movq %rdx, %rdi
-; AVX512F-NEXT: movq %rcx, %rsi
-; AVX512F-NEXT: callq __fixtfdi
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512F-NEXT: movq %rbx, %rdi
-; AVX512F-NEXT: movq %r14, %rsi
-; AVX512F-NEXT: callq __fixtfdi
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512F-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512F-NEXT: addq $24, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_2f128_to_4i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: subq $24, %rsp
-; AVX512VL-NEXT: movq %rsi, %r14
-; AVX512VL-NEXT: movq %rdi, %rbx
-; AVX512VL-NEXT: movq %rdx, %rdi
-; AVX512VL-NEXT: movq %rcx, %rsi
-; AVX512VL-NEXT: callq __fixtfdi
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: movq %rbx, %rdi
-; AVX512VL-NEXT: movq %r14, %rsi
-; AVX512VL-NEXT: callq __fixtfdi
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512VL-NEXT: addq $24, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_2f128_to_4i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: pushq %r14
-; AVX512DQ-NEXT: pushq %rbx
-; AVX512DQ-NEXT: subq $24, %rsp
-; AVX512DQ-NEXT: movq %rsi, %r14
-; AVX512DQ-NEXT: movq %rdi, %rbx
-; AVX512DQ-NEXT: movq %rdx, %rdi
-; AVX512DQ-NEXT: movq %rcx, %rsi
-; AVX512DQ-NEXT: callq __fixtfdi
-; AVX512DQ-NEXT: vmovq %rax, %xmm0
-; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512DQ-NEXT: movq %rbx, %rdi
-; AVX512DQ-NEXT: movq %r14, %rsi
-; AVX512DQ-NEXT: callq __fixtfdi
-; AVX512DQ-NEXT: vmovq %rax, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512DQ-NEXT: addq $24, %rsp
-; AVX512DQ-NEXT: popq %rbx
-; AVX512DQ-NEXT: popq %r14
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f128_to_4i32:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: pushq %r14
-; AVX512VLDQ-NEXT: pushq %rbx
-; AVX512VLDQ-NEXT: subq $24, %rsp
-; AVX512VLDQ-NEXT: movq %rsi, %r14
-; AVX512VLDQ-NEXT: movq %rdi, %rbx
-; AVX512VLDQ-NEXT: movq %rdx, %rdi
-; AVX512VLDQ-NEXT: movq %rcx, %rsi
-; AVX512VLDQ-NEXT: callq __fixtfdi
-; AVX512VLDQ-NEXT: vmovq %rax, %xmm0
-; AVX512VLDQ-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX512VLDQ-NEXT: movq %rbx, %rdi
-; AVX512VLDQ-NEXT: movq %r14, %rsi
-; AVX512VLDQ-NEXT: callq __fixtfdi
-; AVX512VLDQ-NEXT: vmovq %rax, %xmm0
-; AVX512VLDQ-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512VLDQ-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX512VLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512VLDQ-NEXT: addq $24, %rsp
-; AVX512VLDQ-NEXT: popq %rbx
-; AVX512VLDQ-NEXT: popq %r14
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: fptosi_2f128_to_4i32:
+; AVX: # BB#0:
+; AVX-NEXT: pushq %r14
+; AVX-NEXT: pushq %rbx
+; AVX-NEXT: subq $24, %rsp
+; AVX-NEXT: movq %rsi, %r14
+; AVX-NEXT: movq %rdi, %rbx
+; AVX-NEXT: movq %rdx, %rdi
+; AVX-NEXT: movq %rcx, %rsi
+; AVX-NEXT: callq __fixtfdi
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: movq %rbx, %rdi
+; AVX-NEXT: movq %r14, %rsi
+; AVX-NEXT: callq __fixtfdi
+; AVX-NEXT: vmovq %rax, %xmm0
+; AVX-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX-NEXT: addq $24, %rsp
+; AVX-NEXT: popq %rbx
+; AVX-NEXT: popq %r14
+; AVX-NEXT: retq
%cvt = fptosi <2 x fp128> %a to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %ext
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 58d7f7bf3d83..6a81cdc490fe 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -288,7 +288,7 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
;
; AVX512VL-LABEL: sitofp_4i64_to_4f64:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
@@ -299,7 +299,7 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_4i64_to_4f64:
@@ -821,7 +821,7 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
;
; AVX512VL-LABEL: uitofp_4i64_to_4f64:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
@@ -832,7 +832,7 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i64_to_4f64:
@@ -1430,7 +1430,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
@@ -2344,7 +2344,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
@@ -2775,7 +2775,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; AVX512VL-LABEL: sitofp_load_4i64_to_4f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
@@ -2786,7 +2786,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64:
@@ -3190,7 +3190,7 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; AVX512VL-LABEL: uitofp_load_4i64_to_4f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
@@ -3201,7 +3201,7 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64:
@@ -3426,7 +3426,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
@@ -3667,7 +3667,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32:
@@ -4013,7 +4013,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
@@ -4593,7 +4593,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32:
diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll
index 31eb2202a05e..5bf6fbeb6235 100644
--- a/test/CodeGen/X86/vector-half-conversions.ll
+++ b/test/CodeGen/X86/vector-half-conversions.ll
@@ -461,7 +461,7 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x float>
@@ -757,7 +757,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
;
; AVX512VL-LABEL: cvt_16i16_to_16f32:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm10
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm10
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: shrq $48, %rcx
@@ -840,14 +840,14 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm7[0],xmm5[0],xmm7[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
%1 = bitcast <16 x i16> %a0 to <16 x half>
@@ -1227,7 +1227,7 @@ define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>
@@ -1491,14 +1491,14 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512VL-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a0
@@ -1738,7 +1738,7 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%1 = bitcast <4 x i16> %a0 to <4 x half>
%2 = fpext <4 x half> %1 to <4 x double>
@@ -1929,7 +1929,7 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = bitcast <4 x i16> %1 to <4 x half>
@@ -2145,14 +2145,14 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512VL-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
@@ -2350,7 +2350,7 @@ define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
@@ -2474,7 +2474,7 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2643,14 +2643,14 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512VL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
@@ -3182,7 +3182,7 @@ define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
; AVX512VL-NEXT: orl %edx, %eax
; AVX512VL-NEXT: shlq $32, %rax
; AVX512VL-NEXT: orq %rcx, %rax
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512VL-NEXT: vmovd %xmm1, %ecx
@@ -3427,7 +3427,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
@@ -3458,7 +3458,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
@@ -3479,7 +3479,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
; AVX512VL-NEXT: vmovd %xmm0, %eax
; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
@@ -3958,7 +3958,7 @@ define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512VL-NEXT: vmovd %xmm1, %r10d
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
; AVX512VL-NEXT: vmovd %xmm2, %r11d
@@ -4191,9 +4191,9 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
;
; AVX512VL-LABEL: store_cvt_16f32_to_16i16:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm2, %xmm3
+; AVX512VL-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm4
; AVX512VL-NEXT: vmovd %xmm4, %eax
; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm4
@@ -4422,7 +4422,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX512VL-NEXT: movzwl %ax, %r14d
; AVX512VL-NEXT: orl %ebx, %r14d
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: callq __truncdfhf2
@@ -4572,7 +4572,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX512VL-NEXT: movzwl %ax, %r14d
; AVX512VL-NEXT: orl %ebx, %r14d
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: callq __truncdfhf2
@@ -4726,7 +4726,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX512VL-NEXT: movzwl %ax, %r14d
; AVX512VL-NEXT: orl %ebx, %r14d
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: callq __truncdfhf2
@@ -4969,7 +4969,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX512VL-NEXT: movzwl %ax, %r15d
; AVX512VL-NEXT: orl %ebx, %r15d
; AVX512VL-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: callq __truncdfhf2
@@ -4994,7 +4994,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX512VL-NEXT: movzwl %ax, %r15d
; AVX512VL-NEXT: orl %ebx, %r15d
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: callq __truncdfhf2
@@ -5188,7 +5188,7 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %r14d
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: callq __truncdfhf2
@@ -5357,7 +5357,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
; AVX512VL-NEXT: movzwl %ax, %ebx
; AVX512VL-NEXT: orl %ebp, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: callq __truncdfhf2
@@ -5528,7 +5528,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
; AVX512VL-NEXT: movzwl %ax, %ebx
; AVX512VL-NEXT: orl %ebp, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: callq __truncdfhf2
@@ -5775,7 +5775,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: callq __truncdfhf2
@@ -5787,7 +5787,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movl %eax, %r12d
; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: callq __truncdfhf2
diff --git a/test/CodeGen/X86/vector-lzcnt-256.ll b/test/CodeGen/X86/vector-lzcnt-256.ll
index 3ad13e03dbde..c68395493023 100644
--- a/test/CodeGen/X86/vector-lzcnt-256.ll
+++ b/test/CodeGen/X86/vector-lzcnt-256.ll
@@ -710,35 +710,20 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VLCD-LABEL: testv32i8:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1
-; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VLCD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
-; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
-; AVX512VLCD-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: testv32i8:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
-; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
-; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
-; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: testv32i8:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512-NEXT: vplzcntd %zmm1, %zmm1
+; AVX512-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
+; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
;
; X32-AVX-LABEL: testv32i8:
; X32-AVX: # BB#0:
@@ -799,35 +784,20 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
-; AVX512VLCD-LABEL: testv32i8u:
-; AVX512VLCD: ## BB#0:
-; AVX512VLCD-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VLCD-NEXT: vplzcntd %zmm1, %zmm1
-; AVX512VLCD-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VLCD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
-; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512VLCD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VLCD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512VLCD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VLCD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
-; AVX512VLCD-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VLCD-NEXT: retq
-;
-; AVX512CD-LABEL: testv32i8u:
-; AVX512CD: ## BB#0:
-; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
-; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
-; AVX512CD-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512CD-NEXT: vpsubb %xmm2, %xmm0, %xmm0
-; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512CD-NEXT: retq
+; AVX512-LABEL: testv32i8u:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512-NEXT: vplzcntd %zmm1, %zmm1
+; AVX512-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
+; AVX512-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpsubb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
;
; X32-AVX-LABEL: testv32i8u:
; X32-AVX: # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index ba47740cbaf0..3c7fd8b51a02 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1789,25 +1789,15 @@ define <16 x i16> @shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7]
+; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 11, i32 8, i32 9, i32 8, i32 9, i32 10, i32 11, i32 10, i32 11>
ret <16 x i16> %shuffle
}
@@ -1822,23 +1812,14 @@ define <16 x i16> @shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 9, i32 14, i32 15, i32 12, i32 13, i32 10, i32 11, i32 8, i32 9>
ret <16 x i16> %shuffle
}
@@ -1885,23 +1866,14 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
-; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
+; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
@@ -1919,29 +1891,17 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
@@ -1957,25 +1917,15 @@ define <16 x i16> @shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 11, i32 undef, i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11>
ret <16 x i16> %shuffle
}
@@ -1991,25 +1941,15 @@ define <16 x i16> @shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
+; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 15, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15>
ret <16 x i16> %shuffle
}
@@ -2026,27 +1966,16 @@ define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
+; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 13, i32 11, i32 9, i32 10, i32 8, i32 14, i32 15, i32 12, i32 13>
ret <16 x i16> %shuffle
}
@@ -2062,25 +1991,15 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 8, i32 12, i32 12, i32 12, i32 12, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
@@ -2095,23 +2014,14 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
ret <16 x i16> %shuffle
}
@@ -2128,27 +2038,16 @@ define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
+; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7]
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 10, i32 14, i32 15, i32 12, i32 13>
ret <16 x i16> %shuffle
}
@@ -2164,25 +2063,15 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3]
+; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 15, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 15>
ret <16 x i16> %shuffle
}
@@ -2210,12 +2099,12 @@ define <16 x i16> @shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_0
;
; AVX512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 8, i32 15, i32 13, i32 14, i32 12, i32 11, i32 9, i32 10, i32 8>
ret <16 x i16> %shuffle
@@ -2232,25 +2121,15 @@ define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 8, i32 9, i32 8, i32 13, i32 12, i32 13, i32 12, i32 9, i32 8>
ret <16 x i16> %shuffle
}
@@ -2266,25 +2145,15 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 8, i32 13, i32 12, i32 9, i32 8, i32 13, i32 12, i32 9, i32 8>
ret <16 x i16> %shuffle
}
@@ -2300,25 +2169,15 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 12, i32 13, i32 12, i32 9, i32 8, i32 9, i32 8, i32 13, i32 12>
ret <16 x i16> %shuffle
}
@@ -2334,25 +2193,15 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8>
ret <16 x i16> %shuffle
}
@@ -2368,25 +2217,15 @@ define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12>
ret <16 x i16> %shuffle
}
@@ -2414,12 +2253,12 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_1
;
; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 11, i32 10, i32 14, i32 12, i32 8, i32 13, i32 9, i32 15, i32 11>
ret <16 x i16> %shuffle
@@ -2448,12 +2287,12 @@ define <16 x i16> @shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_1
;
; AVX512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 11, i32 10, i32 8, i32 14, i32 12, i32 13, i32 9, i32 15, i32 11>
ret <16 x i16> %shuffle
@@ -2482,12 +2321,12 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_1
;
; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 13, i32 10, i32 14, i32 12, i32 8, i32 9, i32 11, i32 15, i32 13>
ret <16 x i16> %shuffle
@@ -2516,12 +2355,12 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1
;
; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 11, i32 14, i32 14, i32 15, i32 13, i32 9, i32 14, i32 12, i32 11>
ret <16 x i16> %shuffle
@@ -2538,25 +2377,15 @@ define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
@@ -2572,25 +2401,15 @@ define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
@@ -2606,25 +2425,15 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
@@ -2640,25 +2449,15 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
@@ -2675,27 +2474,16 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 15, i32 8, i32 12, i32 12, i32 8, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
}
@@ -2711,25 +2499,15 @@ define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 undef, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
@@ -2745,25 +2523,15 @@ define <16 x i16> @shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 undef, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
@@ -2779,25 +2547,15 @@ define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 undef, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
@@ -2848,13 +2606,13 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_1
;
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm2
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11>
ret <16 x i16> %shuffle
@@ -2926,7 +2684,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_1
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
@@ -2934,7 +2692,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_1
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,1,2]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 11, i32 8, i32 9, i32 10, i32 15, i32 12, i32 13, i32 14, i32 11>
ret <16 x i16> %shuffle
@@ -2961,13 +2719,13 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_1
;
; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3]
; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,3,1]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 15, i32 12, i32 13, i32 14, i32 11, i32 8, i32 9, i32 10, i32 15>
ret <16 x i16> %shuffle
@@ -2996,12 +2754,12 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1
;
; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 13, i32 11, i32 15, i32 9, i32 8, i32 10, i32 15, i32 11, i32 13>
ret <16 x i16> %shuffle
@@ -3693,23 +3451,14 @@ define <16 x i16> @shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
-; AVX512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
-; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
+; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12>
ret <16 x i16> %shuffle
}
@@ -3809,23 +3558,14 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
-; AVX512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10>
ret <16 x i16> %shuffle
}
@@ -4000,17 +3740,11 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16>
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19>
ret <16 x i16> %shuffle
}
@@ -4023,19 +3757,12 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a,
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <16 x i16> %shuffle
}
@@ -4049,17 +3776,11 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a,
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
@@ -4091,19 +3812,12 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index d4ec55a85d8d..301e8079a5dc 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -741,17 +741,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2OR512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
@@ -1167,19 +1161,12 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2OR512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <32 x i8> %shuffle
}
@@ -1706,17 +1693,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
-; AVX2: # BB#0:
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>
ret <32 x i8> %shuffle
}
@@ -1787,19 +1768,12 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47>
ret <32 x i8> %shuffle
}
@@ -2188,7 +2162,7 @@ define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
ret <32 x i8> %shuffle
@@ -2203,17 +2177,11 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
ret <32 x i8> %shuffle
}
@@ -2277,7 +2245,7 @@ define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_
;
; AVX512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 3ecfc29d0f01..7f978138719e 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -320,39 +320,19 @@ define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) {
}
define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_0145:
-; AVX1: # BB#0:
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4f64_0145:
-; AVX2: # BB#0:
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4f64_0145:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v4f64_0145:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x double> %shuffle
}
define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_4501:
-; AVX1: # BB#0:
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4f64_4501:
-; AVX2: # BB#0:
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4f64_4501:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v4f64_4501:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x double> %shuffle
}
@@ -367,23 +347,11 @@ define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) {
}
define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_1054:
-; AVX1: # BB#0:
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4f64_1054:
-; AVX2: # BB#0:
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4f64_1054:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v4f64_1054:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
ret <4 x double> %shuffle
}
@@ -735,7 +703,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_0142:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX512VL-NEXT: retq
@@ -808,7 +776,7 @@ define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_0145:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x i64> %shuffle
@@ -852,7 +820,7 @@ define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_4501:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x i64> %shuffle
@@ -948,7 +916,7 @@ define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) {
;
; AVX512VL-LABEL: shuffle_v4i64_1054:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
@@ -1424,7 +1392,7 @@ define <4 x i64> @concat_v4i64_0145_bc(<4 x i64> %a0, <4 x i64> %a1) {
;
; AVX512VL-LABEL: concat_v4i64_0145_bc:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 0, i32 1>
%a1lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 4, i32 5>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index d6e91ca25d75..cba15827d32c 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -753,17 +753,11 @@ define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
}
define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
-; AVX1OR2-LABEL: shuffle_v8f32_3210ba98:
-; AVX1OR2: # BB#0:
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8f32_3210ba98:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_3210ba98:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
ret <8 x float> %shuffle
}
@@ -829,17 +823,11 @@ define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
}
define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
-; AVX1OR2-LABEL: shuffle_v8f32_ba983210:
-; AVX1OR2: # BB#0:
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8f32_ba983210:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinsertf64x2 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_ba983210:
+; ALL: # BB#0:
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
@@ -863,17 +851,11 @@ define <8 x float> @shuffle_v8f32_a2u3e6f7(<8 x float> %a, <8 x float> %b) {
}
define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) {
-; AVX1OR2-LABEL: shuffle_v8f32_uuuu1111:
-; AVX1OR2: # BB#0:
-; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8f32_uuuu1111:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_uuuu1111:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
ret <8 x float> %shuffle
}
@@ -885,17 +867,11 @@ define <8 x float> @shuffle_v8f32_44444444(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8f32_44444444:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8f32_44444444:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vbroadcastss %xmm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8f32_44444444:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <8 x float> %shuffle
}
@@ -910,33 +886,21 @@ define <8 x float> @shuffle_v8f32_1188uuuu(<8 x float> %a, <8 x float> %b) {
}
define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) {
-; AVX1OR2-LABEL: shuffle_v8f32_uuuu3210:
-; AVX1OR2: # BB#0:
-; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8f32_uuuu3210:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_uuuu3210:
+; ALL: # BB#0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
define <8 x float> @shuffle_v8f32_uuuu1188(<8 x float> %a, <8 x float> %b) {
-; AVX1OR2-LABEL: shuffle_v8f32_uuuu1188:
-; AVX1OR2: # BB#0:
-; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8f32_uuuu1188:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_uuuu1188:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 8, i32 8>
ret <8 x float> %shuffle
}
@@ -951,17 +915,11 @@ define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
}
define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
-; AVX1OR2-LABEL: shuffle_v8f32_5555uuuu:
-; AVX1OR2: # BB#0:
-; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8f32_5555uuuu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextractf32x4 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8f32_5555uuuu:
+; ALL: # BB#0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x float> %shuffle
}
@@ -1041,17 +999,11 @@ define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_00040000:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_00040000:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_00040000:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shuffle
}
@@ -1064,17 +1016,11 @@ define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_00500000:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_00500000:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_00500000:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shuffle
}
@@ -1087,17 +1033,11 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_06000000:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_06000000:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_06000000:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shuffle
}
@@ -1142,17 +1082,11 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_00112233:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_00112233:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_00112233:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
ret <8 x i32> %shuffle
}
@@ -1556,17 +1490,11 @@ define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_00015444:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_00015444:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_00015444:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
ret <8 x i32> %shuffle
}
@@ -1577,17 +1505,11 @@ define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_00204644:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_00204644:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_00204644:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
ret <8 x i32> %shuffle
}
@@ -1598,17 +1520,11 @@ define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_03004474:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_03004474:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_03004474:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
ret <8 x i32> %shuffle
}
@@ -1619,17 +1535,11 @@ define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_10004444:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_10004444:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_10004444:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i32> %shuffle
}
@@ -1640,17 +1550,11 @@ define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_22006446:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_22006446:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_22006446:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
ret <8 x i32> %shuffle
}
@@ -1661,17 +1565,11 @@ define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_33307474:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_33307474:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_33307474:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
ret <8 x i32> %shuffle
}
@@ -1682,17 +1580,11 @@ define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_32104567:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_32104567:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_32104567:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
}
@@ -1703,17 +1595,11 @@ define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_00236744:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_00236744:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_00236744:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
ret <8 x i32> %shuffle
}
@@ -1724,17 +1610,11 @@ define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_00226644:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_00226644:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_00226644:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
ret <8 x i32> %shuffle
}
@@ -1745,17 +1625,11 @@ define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_10324567:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_10324567:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_10324567:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
}
@@ -1766,17 +1640,11 @@ define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_11334567:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_11334567:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_11334567:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
}
@@ -1787,17 +1655,11 @@ define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_01235467:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_01235467:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_01235467:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i32> %shuffle
}
@@ -1808,17 +1670,11 @@ define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_01235466:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_01235466:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6]
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_01235466:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6]
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
ret <8 x i32> %shuffle
}
@@ -1829,17 +1685,11 @@ define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_002u6u44:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4>
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_002u6u44:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4>
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_002u6u44:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4>
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
ret <8 x i32> %shuffle
}
@@ -1850,17 +1700,11 @@ define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_00uu66uu:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_00uu66uu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u>
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_00uu66uu:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u>
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
ret <8 x i32> %shuffle
}
@@ -1871,17 +1715,11 @@ define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_103245uu:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_103245uu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u>
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_103245uu:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u>
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
ret <8 x i32> %shuffle
}
@@ -1892,17 +1730,11 @@ define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_1133uu67:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7>
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_1133uu67:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7>
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_1133uu67:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7>
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x i32> %shuffle
}
@@ -1913,17 +1745,11 @@ define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_0uu354uu:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_0uu354uu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u>
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_0uu354uu:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u>
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
ret <8 x i32> %shuffle
}
@@ -1934,17 +1760,11 @@ define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_uuu3uu66:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6>
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_uuu3uu66:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6>
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_uuu3uu66:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6>
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
ret <8 x i32> %shuffle
}
@@ -2038,17 +1858,11 @@ define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_3210ba98:
-; AVX2: # BB#0:
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_3210ba98:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_3210ba98:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
ret <8 x i32> %shuffle
}
@@ -2185,17 +1999,11 @@ define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_uuuu1111:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_uuuu1111:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_uuuu1111:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shuffle
}
@@ -2233,7 +2041,7 @@ define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
;
; AVX512VL-LABEL: shuffle_v8i32_44444444:
; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
@@ -2247,17 +2055,11 @@ define <8 x i32> @shuffle_v8i32_5555uuuu(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_5555uuuu:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_5555uuuu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti32x4 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_5555uuuu:
+; AVX2OR512VL: # BB#0:
+; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i32> %shuffle
}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 7f7c27af47b3..b951bf1c97ed 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
target triple = "x86_64-unknown-unknown"
@@ -35,7 +35,7 @@ define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d
define <16 x float> @shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz:
; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; ALL-NEXT: vxorps %zmm1, %zmm1, %zmm1
; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <16 x i32><i32 0, i32 16, i32 1, i32 16, i32 4, i32 16, i32 5, i32 16, i32 8, i32 16, i32 9, i32 16, i32 12, i32 16, i32 13, i32 16>
@@ -82,7 +82,7 @@ define <16 x float> @shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f
define <16 x float> @shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f:
; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; ALL-NEXT: vxorps %zmm0, %zmm0, %zmm0
; ALL-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> zeroinitializer, <16 x float> %b, <16 x i32><i32 0, i32 18, i32 0, i32 19, i32 4, i32 22, i32 4, i32 23, i32 6, i32 26, i32 6, i32 27, i32 8, i32 30, i32 8, i32 31>
@@ -262,8 +262,8 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
; ALL-LABEL: shuffle_v16f32_extract_256:
; ALL: # BB#0:
-; ALL-NEXT: vmovupd (%rsi), %zmm0
-; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vmovups (%rsi), %zmm0
+; ALL-NEXT: vextractf32x8 $1, %zmm0, %ymm0
; ALL-NEXT: retq
%ptr_a = bitcast float* %a to <16 x float>*
%v_a = load <16 x float>, <16 x float>* %ptr_a, align 4
@@ -397,8 +397,8 @@ define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15
ret <16 x i32> %res
}
-define <16 x i32> @mask_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
-; ALL-LABEL: mask_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
+; ALL-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
; ALL: # BB#0:
; ALL-NEXT: kmovw %edi, %k1
; ALL-NEXT: valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
@@ -422,8 +422,8 @@ define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_1
ret <16 x i32> %res
}
-define <16 x i32> @maskz_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-; ALL-LABEL: maskz_shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
+define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; ALL-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
; ALL: # BB#0:
; ALL-NEXT: kmovw %edi, %k1
; ALL-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
@@ -495,3 +495,55 @@ define <16 x i32> @test_vshufi32x4_512_mask(<16 x i32> %x, <16 x i32> %x1, <16 x
%res = select <16 x i1> %mask, <16 x i32> %x2, <16 x i32> %y
ret <16 x i32> %res
}
+
+define <16 x float> @mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) {
+; ALL-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
+; ALL: # BB#0:
+; ALL-NEXT: kmovw %edi, %k1
+; ALL-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru
+ ret <16 x float> %res
+}
+
+define <16 x float> @mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) {
+; ALL-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
+; ALL: # BB#0:
+; ALL-NEXT: kmovw %edi, %k1
+; ALL-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru
+ ret <16 x float> %res
+}
+
+define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
+; ALL-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
+; ALL: # BB#0:
+; ALL-NEXT: kmovw %edi, %k1
+; ALL-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; ALL-NEXT: vmovdqa64 %zmm2, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
+; ALL-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
+; ALL: # BB#0:
+; ALL-NEXT: kmovw %edi, %k1
+; ALL-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; ALL-NEXT: vmovdqa64 %zmm2, %zmm0
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
+ ret <16 x i32> %res
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 625681dc294c..365ff3bf63d5 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2375,3 +2375,199 @@ define <8 x i64> @maskz_shuffle_v8i64_12345670(<8 x i64> %a, i8 %mask) {
%res = select <8 x i1> %mask.cast, <8 x i64> %shuffle, <8 x i64> zeroinitializer
ret <8 x i64> %res
}
+
+define <8 x double> @shuffle_v8f64_012389AB(<8 x double> %a, <8 x double> %b) {
+; AVX512F-LABEL: shuffle_v8f64_012389AB:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_012389AB:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_89AB0123(<8 x double> %a, <8 x double> %b) {
+; AVX512F-LABEL: shuffle_v8f64_89AB0123:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_89AB0123:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_01230123(<8 x double> %a, <8 x double> %b) {
+; AVX512F-LABEL: shuffle_v8f64_01230123:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_01230123:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x double> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_012389AB(<8 x i64> %a, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_012389AB:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_012389AB:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_89AB0123(<8 x i64> %a, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_89AB0123:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_89AB0123:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01230123(<8 x i64> %a, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_01230123:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_01230123:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_89234567(<8 x double> %a, <8 x double> %b) {
+; AVX512F-LABEL: shuffle_v8f64_89234567:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_89234567:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_01894567(<8 x double> %a, <8 x double> %b) {
+; AVX512F-LABEL: shuffle_v8f64_01894567:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_01894567:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_01238967(<8 x double> %a, <8 x double> %b) {
+; AVX512F-LABEL: shuffle_v8f64_01238967:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_01238967:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @shuffle_v8f64_01234589(<8 x double> %a, <8 x double> %b) {
+; AVX512F-LABEL: shuffle_v8f64_01234589:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_01234589:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ ret <8 x double> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_89234567(<8 x i64> %a, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_89234567:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_89234567:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01894567(<8 x i64> %a, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_01894567:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_01894567:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01238967(<8 x i64> %a, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_01238967:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_01238967:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @shuffle_v8i64_01234589(<8 x i64> %a, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_01234589:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vinserti32x4 $3, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_01234589:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vinserti32x4 $3, %xmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+ ret <8 x i64> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll
index 1dcfd3223c86..f828ed0ba6e7 100644
--- a/test/CodeGen/X86/vector-trunc-math.ll
+++ b/test/CodeGen/X86/vector-trunc-math.ll
@@ -419,40 +419,31 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v4i64_v4i32:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm2
-; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE-NEXT: paddq %xmm2, %xmm0
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: movl $1, %eax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
; AVX512: # BB#0:
-; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -462,52 +453,39 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i64_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm4
-; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; SSE-NEXT: paddq %xmm0, %xmm4
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: movl $1, %eax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -515,14 +493,14 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -532,41 +510,38 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i32_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -576,17 +551,6 @@ define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i64_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm8
-; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
-; SSE-NEXT: paddq %xmm8, %xmm0
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm4
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm5
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm6
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
@@ -603,50 +567,37 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: movl $1, %eax
-; AVX1-NEXT: vmovq %rax, %xmm4
-; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
-; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
-; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
-; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm3, %ymm3
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
@@ -666,37 +617,35 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512F-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: retq
%1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -706,10 +655,6 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i32_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm2
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
@@ -718,31 +663,27 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
@@ -752,13 +693,14 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -768,56 +710,54 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i16_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
-; SSE-NEXT: paddw {{.*}}(%rip), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: retq
%1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -1676,69 +1616,39 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX1-LABEL: trunc_mul_v4i64_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
-; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
-; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
-; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v4i64_v4i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512F-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX512F-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
@@ -1757,46 +1667,17 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v8i64_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: movdqa %xmm0, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm4, %xmm8
-; SSE-NEXT: movdqa %xmm4, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm0, %xmm9
-; SSE-NEXT: paddq %xmm8, %xmm9
-; SSE-NEXT: psllq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm4, %xmm0
-; SSE-NEXT: paddq %xmm9, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm5, %xmm8
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm1, %xmm4
-; SSE-NEXT: paddq %xmm8, %xmm4
-; SSE-NEXT: psllq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm5, %xmm1
-; SSE-NEXT: paddq %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm6, %xmm4
-; SSE-NEXT: movdqa %xmm6, %xmm5
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm2, %xmm5
-; SSE-NEXT: paddq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm6, %xmm2
-; SSE-NEXT: paddq %xmm5, %xmm2
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm7, %xmm4
-; SSE-NEXT: movdqa %xmm7, %xmm5
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm3, %xmm5
-; SSE-NEXT: paddq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm7, %xmm3
-; SSE-NEXT: paddq %xmm5, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
@@ -1808,111 +1689,68 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm6, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v8i64_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
-; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5
-; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5
-; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
-; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
-; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2
-; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5
-; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5
-; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
-; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
-; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6
-; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v8i64_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4
-; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5
-; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5
-; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
-; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
-; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
-; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4
-; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512F-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512F-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512F-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
@@ -2186,104 +2024,60 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
;
; AVX2-LABEL: trunc_mul_v16i64_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm8
-; AVX2-NEXT: vpmuludq %ymm5, %ymm8, %ymm8
-; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm9
-; AVX2-NEXT: vpmuludq %ymm9, %ymm1, %ymm9
-; AVX2-NEXT: vpaddq %ymm8, %ymm9, %ymm8
-; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8
-; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm8, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm5
-; AVX2-NEXT: vpmuludq %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm8
-; AVX2-NEXT: vpmuludq %ymm8, %ymm0, %ymm8
-; AVX2-NEXT: vpaddq %ymm5, %ymm8, %ymm5
-; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
-; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm5, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX2-NEXT: vpmuludq %ymm7, %ymm4, %ymm4
-; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm5
-; AVX2-NEXT: vpmuludq %ymm5, %ymm3, %ymm5
-; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
-; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
-; AVX2-NEXT: vpmuludq %ymm6, %ymm4, %ymm4
-; AVX2-NEXT: vpsrlq $32, %ymm6, %ymm5
-; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm5
-; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
-; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpaddq %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vpmulld %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpmulld %xmm6, %xmm2, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpmulld %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpmulld %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm4
-; AVX512F-NEXT: vpmuludq %zmm3, %zmm4, %zmm4
-; AVX512F-NEXT: vpsrlq $32, %zmm3, %zmm5
-; AVX512F-NEXT: vpmuludq %zmm5, %zmm1, %zmm5
-; AVX512F-NEXT: vpaddq %zmm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpsllq $32, %zmm4, %zmm4
-; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT: vpaddq %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm3
-; AVX512F-NEXT: vpmuludq %zmm2, %zmm3, %zmm3
-; AVX512F-NEXT: vpsrlq $32, %zmm2, %zmm4
-; AVX512F-NEXT: vpmuludq %zmm4, %zmm0, %zmm4
-; AVX512F-NEXT: vpaddq %zmm3, %zmm4, %zmm3