-rwxr-xr-xcmake/config-ix.cmake13
-rwxr-xr-xcmake/modules/AddLLVM.cmake17
-rw-r--r--include/llvm/Analysis/ScalarEvolution.h2
-rw-r--r--include/llvm/Analysis/TargetLibraryInfo.h4
-rw-r--r--include/llvm/CodeGen/MachineBasicBlock.h10
-rw-r--r--include/llvm/CodeGen/MachineFrameInfo.h3
-rw-r--r--include/llvm/DebugInfo/MSF/StreamArray.h111
-rw-r--r--include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h4
-rw-r--r--include/llvm/ExecutionEngine/Orc/RPCUtils.h246
-rw-r--r--include/llvm/ExecutionEngine/Orc/RawByteChannel.h4
-rw-r--r--include/llvm/IR/ModuleSummaryIndexYAML.h12
-rw-r--r--include/llvm/IR/PassManager.h127
-rw-r--r--include/llvm/IR/User.h20
-rw-r--r--include/llvm/Support/Path.h8
-rw-r--r--include/llvm/Transforms/IPO.h13
-rw-r--r--include/llvm/Transforms/IPO/PassManagerBuilder.h1
-rw-r--r--lib/Analysis/InstructionSimplify.cpp20
-rw-r--r--lib/Analysis/LoopInfo.cpp6
-rw-r--r--lib/Analysis/MemoryDependenceAnalysis.cpp42
-rw-r--r--lib/Analysis/ScalarEvolution.cpp12
-rw-r--r--lib/Analysis/ValueTracking.cpp1
-rw-r--r--lib/Bitcode/Reader/MetadataLoader.cpp13
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp4
-rw-r--r--lib/CodeGen/StackSlotColoring.cpp11
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp44
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h3
-rw-r--r--lib/LTO/ThinLTOCodeGenerator.cpp9
-rw-r--r--lib/Object/MachOObjectFile.cpp8
-rw-r--r--lib/Object/ModuleSummaryIndexObjectFile.cpp8
-rw-r--r--lib/Support/CommandLine.cpp2
-rw-r--r--lib/Support/Path.cpp10
-rw-r--r--lib/Support/TarWriter.cpp42
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp4
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp10
-rw-r--r--lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp52
-rw-r--r--lib/Target/AMDGPU/R600ISelLowering.cpp281
-rw-r--r--lib/Target/AMDGPU/R600Instructions.td11
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp39
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.h3
-rw-r--r--lib/Target/AVR/AVRISelDAGToDAG.cpp4
-rw-r--r--lib/Target/AVR/AVRISelLowering.cpp41
-rw-r--r--lib/Target/AVR/AVRISelLowering.h3
-rw-r--r--lib/Target/BPF/BPFInstrInfo.cpp16
-rw-r--r--lib/Target/BPF/Disassembler/BPFDisassembler.cpp12
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp20
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp11
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp19
-rw-r--r--lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp9
-rw-r--r--lib/Target/TargetMachineC.cpp4
-rw-r--r--lib/Target/WebAssembly/CMakeLists.txt1
-rw-r--r--lib/Target/WebAssembly/WebAssembly.h1
-rw-r--r--lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp159
-rw-r--r--lib/Target/WebAssembly/WebAssemblyInstrInteger.td4
-rw-r--r--lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp4
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp264
-rw-r--r--lib/Target/X86/X86InstrAVX512.td247
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp19
-rw-r--r--lib/Target/X86/X86InstrSSE.td2
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp291
-rw-r--r--lib/Transforms/IPO/LowerTypeTests.cpp109
-rw-r--r--lib/Transforms/IPO/PassManagerBuilder.cpp3
-rw-r--r--lib/Transforms/InstCombine/InstCombineCompares.cpp10
-rw-r--r--lib/Transforms/Instrumentation/AddressSanitizer.cpp1
-rw-r--r--lib/Transforms/Scalar/IndVarSimplify.cpp2
-rw-r--r--lib/Transforms/Scalar/LoopLoadElimination.cpp4
-rw-r--r--lib/Transforms/Scalar/LoopUnswitch.cpp2
-rw-r--r--lib/Transforms/Scalar/NewGVN.cpp192
-rw-r--r--lib/Transforms/Scalar/SCCP.cpp18
-rw-r--r--lib/Transforms/Utils/FunctionImportUtils.cpp15
-rw-r--r--lib/Transforms/Utils/SimplifyLibCalls.cpp12
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp34
-rw-r--r--test/Analysis/CostModel/X86/shuffle-reverse.ll2
-rw-r--r--test/Analysis/CostModel/X86/testshiftlshr.ll4
-rw-r--r--test/Analysis/CostModel/X86/testshiftshl.ll4
-rw-r--r--test/Analysis/CostModel/X86/vshift-ashr-cost.ll45
-rw-r--r--test/Analysis/CostModel/X86/vshift-lshr-cost.ll66
-rw-r--r--test/Analysis/CostModel/X86/vshift-shl-cost.ll70
-rw-r--r--test/Analysis/ScalarEvolution/invalidation.ll70
-rw-r--r--test/Analysis/ValueTracking/assume.ll22
-rw-r--r--test/Bindings/Go/lit.local.cfg2
-rw-r--r--test/Bindings/OCaml/lit.local.cfg2
-rw-r--r--test/CMakeLists.txt14
-rw-r--r--test/CodeGen/AMDGPU/load-constant-i16.ll138
-rw-r--r--test/CodeGen/AMDGPU/load-global-i16.ll331
-rw-r--r--test/CodeGen/AMDGPU/min.ll172
-rw-r--r--test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll16
-rw-r--r--test/CodeGen/AMDGPU/store-private.ll743
-rw-r--r--test/CodeGen/AVR/intrinsics/read_register.ll17
-rw-r--r--test/CodeGen/WebAssembly/function-bitcasts.ll56
-rw-r--r--test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll26
-rw-r--r--test/CodeGen/X86/avx2-arith.ll101
-rw-r--r--test/CodeGen/X86/avx512-bugfix-23634.ll2
-rw-r--r--test/CodeGen/X86/avx512-calling-conv.ll24
-rw-r--r--test/CodeGen/X86/avx512-cvt.ll14
-rw-r--r--test/CodeGen/X86/avx512-ext.ll33
-rw-r--r--test/CodeGen/X86/avx512-insert-extract.ll56
-rw-r--r--test/CodeGen/X86/avx512-mask-op.ll110
-rw-r--r--test/CodeGen/X86/avx512-mov.ll16
-rw-r--r--test/CodeGen/X86/avx512-regcall-NoMask.ll30
-rw-r--r--test/CodeGen/X86/avx512-vbroadcast.ll3
-rw-r--r--test/CodeGen/X86/avx512-vec-cmp.ll141
-rw-r--r--test/CodeGen/X86/avx512bw-mov.ll4
-rw-r--r--test/CodeGen/X86/avx512bw-vec-cmp.ll36
-rw-r--r--test/CodeGen/X86/avx512bwvl-mov.ll8
-rw-r--r--test/CodeGen/X86/avx512bwvl-vec-cmp.ll72
-rw-r--r--test/CodeGen/X86/avx512vl-mov.ll32
-rw-r--r--test/CodeGen/X86/avx512vl-vec-cmp.ll144
-rw-r--r--test/CodeGen/X86/cmov.ll6
-rw-r--r--test/CodeGen/X86/fma-fneg-combine.ll12
-rw-r--r--test/CodeGen/X86/fmaddsub-combine.ll129
-rw-r--r--test/CodeGen/X86/sse-fsignum.ll11
-rw-r--r--test/CodeGen/X86/vector-compare-results.ll6208
-rw-r--r--test/CodeGen/X86/vector-sext.ll45
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-128.ll130
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-256.ll234
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-512.ll52
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-128.ll94
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-256.ll162
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-512.ll52
-rw-r--r--test/CodeGen/X86/vector-shift-shl-128.ll88
-rw-r--r--test/CodeGen/X86/vector-shift-shl-256.ll154
-rw-r--r--test/CodeGen/X86/vector-shift-shl-512.ll27
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v64.ll9
-rw-r--r--test/CodeGen/X86/vector-shuffle-masked.ll33
-rw-r--r--test/CodeGen/X86/vector-shuffle-v1.ll74
-rw-r--r--test/ExecutionEngine/Interpreter/lit.local.cfg2
-rw-r--r--test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_BE-relocations.s11
-rw-r--r--test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_local_branch.s14
-rw-r--r--test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_relocations.s35
-rw-r--r--test/Instrumentation/AddressSanitizer/global_metadata_darwin.ll2
-rw-r--r--test/JitListener/lit.local.cfg2
-rw-r--r--test/ThinLTO/X86/Inputs/funcimport-tbaa.ll11
-rw-r--r--test/ThinLTO/X86/Inputs/local_name_conflict1.ll17
-rw-r--r--test/ThinLTO/X86/Inputs/local_name_conflict2.ll17
-rw-r--r--test/ThinLTO/X86/funcimport-tbaa.ll38
-rw-r--r--test/ThinLTO/X86/local_name_conflict.ll29
-rw-r--r--test/Transforms/GVN/invariant.group.ll52
-rw-r--r--test/Transforms/InstCombine/assume.ll45
-rw-r--r--test/Transforms/InstCombine/assume2.ll141
-rw-r--r--test/Transforms/InstCombine/fabs.ll42
-rw-r--r--test/Transforms/InstCombine/fast-math.ll6
-rw-r--r--test/Transforms/InstCombine/urem-simplify-bug.ll52
-rw-r--r--test/Transforms/InstSimplify/div.ll15
-rw-r--r--test/Transforms/InstSimplify/rem.ll14
-rw-r--r--test/Transforms/LICM/hoisting.ll27
-rw-r--r--test/Transforms/LoopLoadElim/forward.ll6
-rw-r--r--test/Transforms/LoopVectorize/iv_outside_user.ll45
-rw-r--r--test/Transforms/NewGVN/basic-cyclic-opt.ll235
-rw-r--r--test/Transforms/NewGVN/cyclic-phi-handling.ll37
-rw-r--r--test/Transforms/NewGVN/invariant.group.ll52
-rw-r--r--test/Transforms/NewGVN/memory-handling.ll195
-rw-r--r--test/Transforms/NewGVN/pr31501.ll136
-rw-r--r--test/Transforms/NewGVN/pr31573.ll42
-rw-r--r--test/lit.cfg10
-rw-r--r--test/lit.site.cfg.in18
-rw-r--r--test/tools/llvm-config/system-libs.test3
-rw-r--r--test/tools/llvm-config/system-libs.windows.test3
-rw-r--r--test/tools/llvm-opt-report/Inputs/dm.c13
-rw-r--r--test/tools/llvm-opt-report/Inputs/dm.yaml104
-rw-r--r--test/tools/llvm-opt-report/func-dm.test17
-rw-r--r--tools/llvm-config/llvm-config.cpp8
-rw-r--r--tools/llvm-objdump/MachODump.cpp31
-rw-r--r--tools/llvm-opt-report/OptReport.cpp14
-rw-r--r--unittests/ExecutionEngine/Orc/RPCUtilsTest.cpp59
-rw-r--r--unittests/IR/UserTest.cpp25
-rw-r--r--utils/unittest/CMakeLists.txt4
-rwxr-xr-xutils/update_test_checks.py51
167 files changed, 10583 insertions, 4006 deletions
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index d76f1293d02c..4288cf4bdd04 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -316,9 +316,9 @@ else()
endif()
endif()
-check_cxx_compiler_flag("-Wno-variadic-macros" SUPPORTS_NO_VARIADIC_MACROS_FLAG)
-check_cxx_compiler_flag("-Wno-gnu-zero-variadic-macro-arguments"
- SUPPORTS_NO_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG)
+check_cxx_compiler_flag("-Wvariadic-macros" SUPPORTS_VARIADIC_MACROS_FLAG)
+check_cxx_compiler_flag("-Wgnu-zero-variadic-macro-arguments"
+ SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG)
set(USE_NO_MAYBE_UNINITIALIZED 0)
set(USE_NO_UNINITIALIZED 0)
@@ -462,13 +462,6 @@ if( MSVC )
if(LLVM_ENABLE_DIA_SDK AND NOT HAVE_DIA_SDK)
message(FATAL_ERROR "DIA SDK not found. If you have both VS 2012 and 2013 installed, you may need to uninstall the former and re-install the latter afterwards.")
endif()
-
- # Normalize to 0/1 for lit.site.cfg
- if(LLVM_ENABLE_DIA_SDK)
- set(LLVM_ENABLE_DIA_SDK 1)
- else()
- set(LLVM_ENABLE_DIA_SDK 0)
- endif()
else()
set(LLVM_ENABLE_DIA_SDK 0)
endif( MSVC )
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index fbef1d04eac4..56ba1479d7ee 100755
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -1011,11 +1011,11 @@ function(add_unittest test_suite test_name)
list(APPEND LLVM_COMPILE_DEFINITIONS GTEST_HAS_PTHREAD=0)
endif ()
- if (SUPPORTS_NO_VARIADIC_MACROS_FLAG)
+ if (SUPPORTS_VARIADIC_MACROS_FLAG)
list(APPEND LLVM_COMPILE_FLAGS "-Wno-variadic-macros")
endif ()
# Some parts of gtest rely on this GNU extension, don't warn on it.
- if(SUPPORTS_NO_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG)
+ if(SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG)
list(APPEND LLVM_COMPILE_FLAGS "-Wno-gnu-zero-variadic-macro-arguments")
endif()
@@ -1067,6 +1067,19 @@ function(llvm_add_go_executable binary pkgpath)
endif()
endfunction()
+# This function canonicalizes, in place, the CMake variables passed by name,
+# converting them from CMake booleans to 0/1 values suitable for passing into
+# Python or C++.
+function(llvm_canonicalize_cmake_booleans)
+ foreach(var ${ARGN})
+ if(${var})
+ set(${var} 1 PARENT_SCOPE)
+ else()
+ set(${var} 0 PARENT_SCOPE)
+ endif()
+ endforeach()
+endfunction(llvm_canonicalize_cmake_booleans)
+
# This function provides an automatic way to 'configure'-like generate a file
# based on a set of common and custom variables, specifically targeting the
# variables needed for the 'lit.site.cfg' files. This function bundles the
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 9dcffe1ac5fb..1a93f9aa5fd2 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -1491,6 +1491,8 @@ public:
void print(raw_ostream &OS) const;
void verify() const;
+ bool invalidate(Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &Inv);
/// Collect parametric terms occurring in step expressions (first step of
/// delinearization).
diff --git a/include/llvm/Analysis/TargetLibraryInfo.h b/include/llvm/Analysis/TargetLibraryInfo.h
index 196fbc7faa8d..8675882431d5 100644
--- a/include/llvm/Analysis/TargetLibraryInfo.h
+++ b/include/llvm/Analysis/TargetLibraryInfo.h
@@ -290,7 +290,7 @@ public:
}
/// Returns extension attribute kind to be used for i32 parameters
- /// correpsonding to C-level int or unsigned int. May be zeroext, signext,
+ /// corresponding to C-level int or unsigned int. May be zeroext, signext,
/// or none.
Attribute::AttrKind getExtAttrForI32Param(bool Signed = true) const {
if (Impl->ShouldExtI32Param)
@@ -301,7 +301,7 @@ public:
}
/// Returns extension attribute kind to be used for i32 return values
- /// correpsonding to C-level int or unsigned int. May be zeroext, signext,
+ /// corresponding to C-level int or unsigned int. May be zeroext, signext,
/// or none.
Attribute::AttrKind getExtAttrForI32Return(bool Signed = true) const {
if (Impl->ShouldExtI32Return)
diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h
index 92a9896d7a18..f3f5e324d76a 100644
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h
@@ -308,6 +308,16 @@ public:
// Iteration support for live in sets. These sets are kept in sorted
// order by their register number.
typedef LiveInVector::const_iterator livein_iterator;
+#ifndef NDEBUG
+ /// Unlike livein_begin, this method does not check that the liveness
+  /// information is accurate. Still, for debug purposes, it may be useful
+ /// to have iterators that won't assert if the liveness information
+ /// is not current.
+ livein_iterator livein_begin_dbg() const { return LiveIns.begin(); }
+ iterator_range<livein_iterator> liveins_dbg() const {
+ return make_range(livein_begin_dbg(), livein_end());
+ }
+#endif
livein_iterator livein_begin() const;
livein_iterator livein_end() const { return LiveIns.end(); }
bool livein_empty() const { return LiveIns.empty(); }
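
As an illustrative, hedged sketch (not part of this patch): a debugging helper could walk the live-in list through the new unchecked accessors even before liveness has been recomputed. The RegisterMaskPair field name and the TargetRegisterInfo-based printing are assumptions about the surrounding headers, not something this change introduces.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetRegisterInfo.h"

#ifndef NDEBUG
// Dump live-in registers without asserting that liveness info is current.
static void dumpLiveInsUnchecked(const llvm::MachineBasicBlock &MBB,
                                 const llvm::TargetRegisterInfo &TRI) {
  for (const auto &LI : MBB.liveins_dbg()) // unchecked iteration added above
    llvm::dbgs() << "  live-in: " << TRI.getName(LI.PhysReg) << '\n';
}
#endif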
diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index 2fab8137564e..4600c2c0f10c 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h
@@ -148,8 +148,7 @@ class MachineFrameInfo {
/// grouping overaligned allocas into a "secondary stack frame" and
/// then only use a single alloca to allocate this frame and only a
/// single virtual register to access it. Currently, without such an
- /// optimization, each such alloca gets it's own dynamic
- /// realignment.
+ /// optimization, each such alloca gets its own dynamic realignment.
bool StackRealignable;
/// Whether the function has the \c alignstack attribute.
diff --git a/include/llvm/DebugInfo/MSF/StreamArray.h b/include/llvm/DebugInfo/MSF/StreamArray.h
index d8b74bc75c94..3bba80d807f3 100644
--- a/include/llvm/DebugInfo/MSF/StreamArray.h
+++ b/include/llvm/DebugInfo/MSF/StreamArray.h
@@ -11,6 +11,7 @@
#define LLVM_DEBUGINFO_MSF_STREAMARRAY_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/iterator.h"
#include "llvm/DebugInfo/MSF/StreamRef.h"
#include "llvm/Support/Error.h"
#include <cassert>
@@ -107,7 +108,10 @@ private:
Extractor E;
};
-template <typename ValueType, typename Extractor> class VarStreamArrayIterator {
+template <typename ValueType, typename Extractor>
+class VarStreamArrayIterator
+ : public iterator_facade_base<VarStreamArrayIterator<ValueType, Extractor>,
+ std::forward_iterator_tag, ValueType> {
typedef VarStreamArrayIterator<ValueType, Extractor> IterType;
typedef VarStreamArray<ValueType, Extractor> ArrayType;
@@ -144,41 +148,39 @@ public:
return false;
}
- bool operator!=(const IterType &R) { return !(*this == R); }
-
const ValueType &operator*() const {
assert(Array && !HasError);
return ThisValue;
}
- IterType &operator++() {
- // We are done with the current record, discard it so that we are
- // positioned at the next record.
- IterRef = IterRef.drop_front(ThisLen);
- if (IterRef.getLength() == 0) {
- // There is nothing after the current record, we must make this an end
- // iterator.
- moveToEnd();
- } else {
- // There is some data after the current record.
- auto EC = Extract(IterRef, ThisLen, ThisValue);
- if (EC) {
- consumeError(std::move(EC));
- markError();
- } else if (ThisLen == 0) {
- // An empty record? Make this an end iterator.
+ IterType &operator+=(std::ptrdiff_t N) {
+ while (N > 0) {
+ // We are done with the current record, discard it so that we are
+ // positioned at the next record.
+ IterRef = IterRef.drop_front(ThisLen);
+ if (IterRef.getLength() == 0) {
+ // There is nothing after the current record, we must make this an end
+ // iterator.
moveToEnd();
+ return *this;
+ } else {
+ // There is some data after the current record.
+ auto EC = Extract(IterRef, ThisLen, ThisValue);
+ if (EC) {
+ consumeError(std::move(EC));
+ markError();
+ return *this;
+ } else if (ThisLen == 0) {
+ // An empty record? Make this an end iterator.
+ moveToEnd();
+ return *this;
+ }
}
+ --N;
}
return *this;
}
- IterType operator++(int) {
- IterType Original = *this;
- ++*this;
- return Original;
- }
-
private:
void moveToEnd() {
Array = nullptr;
@@ -211,6 +213,16 @@ public:
assert(Stream.getLength() % sizeof(T) == 0);
}
+ bool operator==(const FixedStreamArray<T> &Other) const {
+ return Stream == Other.Stream;
+ }
+
+ bool operator!=(const FixedStreamArray<T> &Other) const {
+ return !(*this == Other);
+ }
+
+ FixedStreamArray &operator=(const FixedStreamArray &) = default;
+
const T &operator[](uint32_t Index) const {
assert(Index < size());
uint32_t Off = Index * sizeof(T);
@@ -226,6 +238,8 @@ public:
uint32_t size() const { return Stream.getLength() / sizeof(T); }
+ bool empty() const { return size() == 0; }
+
FixedStreamArrayIterator<T> begin() const {
return FixedStreamArrayIterator<T>(*this, 0);
}
@@ -240,36 +254,53 @@ private:
ReadableStreamRef Stream;
};
-template <typename T> class FixedStreamArrayIterator {
+template <typename T>
+class FixedStreamArrayIterator
+ : public iterator_facade_base<FixedStreamArrayIterator<T>,
+ std::random_access_iterator_tag, T> {
+
public:
FixedStreamArrayIterator(const FixedStreamArray<T> &Array, uint32_t Index)
: Array(Array), Index(Index) {}
- bool operator==(const FixedStreamArrayIterator<T> &R) {
- assert(&Array == &R.Array);
- return Index == R.Index;
+ FixedStreamArrayIterator<T> &
+ operator=(const FixedStreamArrayIterator<T> &Other) {
+ Array = Other.Array;
+ Index = Other.Index;
+ return *this;
}
- bool operator!=(const FixedStreamArrayIterator<T> &R) {
- return !(*this == R);
+ const T &operator*() const { return Array[Index]; }
+
+ bool operator==(const FixedStreamArrayIterator<T> &R) const {
+ assert(Array == R.Array);
+ return (Index == R.Index) && (Array == R.Array);
}
- const T &operator*() const { return Array[Index]; }
+ FixedStreamArrayIterator<T> &operator+=(std::ptrdiff_t N) {
+ Index += N;
+ return *this;
+ }
- FixedStreamArrayIterator<T> &operator++() {
- assert(Index < Array.size());
- ++Index;
+ FixedStreamArrayIterator<T> &operator-=(std::ptrdiff_t N) {
+ assert(Index >= N);
+ Index -= N;
return *this;
}
- FixedStreamArrayIterator<T> operator++(int) {
- FixedStreamArrayIterator<T> Original = *this;
- ++*this;
- return Original;
+ std::ptrdiff_t operator-(const FixedStreamArrayIterator<T> &R) const {
+ assert(Array == R.Array);
+ assert(Index >= R.Index);
+ return Index - R.Index;
+ }
+
+ bool operator<(const FixedStreamArrayIterator<T> &RHS) const {
+ assert(Array == RHS.Array);
+ return Index < RHS.Index;
}
private:
- const FixedStreamArray<T> &Array;
+ FixedStreamArray<T> Array;
uint32_t Index;
};
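
For context, a hedged sketch of the pattern the two iterators above now follow (the buffer iterator below is hypothetical; only llvm/ADT/iterator.h is assumed): iterator_facade_base synthesizes ++, --, +, [], != and the remaining orderings from the small core of operations the derived class defines.

#include "llvm/ADT/iterator.h"
#include <cstddef>
#include <iterator>

class BufferIterator
    : public llvm::iterator_facade_base<BufferIterator,
                                        std::random_access_iterator_tag,
                                        const int> {
  const int *Ptr = nullptr;

public:
  BufferIterator() = default;
  explicit BufferIterator(const int *P) : Ptr(P) {}

  // Core operations; everything else comes from the facade.
  const int &operator*() const { return *Ptr; }
  bool operator==(const BufferIterator &R) const { return Ptr == R.Ptr; }
  bool operator<(const BufferIterator &R) const { return Ptr < R.Ptr; }
  BufferIterator &operator+=(std::ptrdiff_t N) { Ptr += N; return *this; }
  BufferIterator &operator-=(std::ptrdiff_t N) { Ptr -= N; return *this; }
  std::ptrdiff_t operator-(const BufferIterator &R) const { return Ptr - R.Ptr; }
};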
diff --git a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h
index ab2b0fad89fd..3086ef0cdf80 100644
--- a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h
+++ b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetRPCAPI.h
@@ -83,7 +83,7 @@ public:
namespace remote {
class OrcRemoteTargetRPCAPI
- : public rpc::SingleThreadedRPC<rpc::RawByteChannel> {
+ : public rpc::SingleThreadedRPCEndpoint<rpc::RawByteChannel> {
protected:
class ResourceIdMgr {
public:
@@ -108,7 +108,7 @@ protected:
public:
// FIXME: Remove constructors once MSVC supports synthesizing move-ops.
OrcRemoteTargetRPCAPI(rpc::RawByteChannel &C)
- : rpc::SingleThreadedRPC<rpc::RawByteChannel>(C, true) {}
+ : rpc::SingleThreadedRPCEndpoint<rpc::RawByteChannel>(C, true) {}
class CallIntVoid
: public rpc::Function<CallIntVoid, int32_t(JITTargetAddress Addr)> {
diff --git a/include/llvm/ExecutionEngine/Orc/RPCUtils.h b/include/llvm/ExecutionEngine/Orc/RPCUtils.h
index f51fbe153a41..37e2e66e5af4 100644
--- a/include/llvm/ExecutionEngine/Orc/RPCUtils.h
+++ b/include/llvm/ExecutionEngine/Orc/RPCUtils.h
@@ -702,7 +702,7 @@ public:
/// sync.
template <typename ImplT, typename ChannelT, typename FunctionIdT,
typename SequenceNumberT>
-class RPCBase {
+class RPCEndpointBase {
protected:
class OrcRPCInvalid : public Function<OrcRPCInvalid, void()> {
public:
@@ -747,7 +747,7 @@ protected:
public:
/// Construct an RPC instance on a channel.
- RPCBase(ChannelT &C, bool LazyAutoNegotiation)
+ RPCEndpointBase(ChannelT &C, bool LazyAutoNegotiation)
: C(C), LazyAutoNegotiation(LazyAutoNegotiation) {
// Hold ResponseId in a special variable, since we expect Response to be
// called relatively frequently, and want to avoid the map lookup.
@@ -788,15 +788,21 @@ public:
return FnIdOrErr.takeError();
}
- // Allocate a sequence number.
- auto SeqNo = SequenceNumberMgr.getSequenceNumber();
- assert(!PendingResponses.count(SeqNo) &&
- "Sequence number already allocated");
+ SequenceNumberT SeqNo; // initialized in locked scope below.
+ {
+ // Lock the pending responses map and sequence number manager.
+ std::lock_guard<std::mutex> Lock(ResponsesMutex);
+
+ // Allocate a sequence number.
+ SeqNo = SequenceNumberMgr.getSequenceNumber();
+ assert(!PendingResponses.count(SeqNo) &&
+ "Sequence number already allocated");
- // Install the user handler.
- PendingResponses[SeqNo] =
+ // Install the user handler.
+ PendingResponses[SeqNo] =
detail::createResponseHandler<ChannelT, typename Func::ReturnType>(
std::move(Handler));
+ }
// Open the function call message.
if (auto Err = C.startSendMessage(FnId, SeqNo)) {
@@ -863,11 +869,33 @@ public:
return detail::ReadArgs<ArgTs...>(Args...);
}
+ /// Abandon all outstanding result handlers.
+ ///
+ /// This will call all currently registered result handlers to receive an
+ /// "abandoned" error as their argument. This is used internally by the RPC
+ /// in error situations, but can also be called directly by clients who are
+ /// disconnecting from the remote and don't or can't expect responses to their
+ /// outstanding calls. (Especially for outstanding blocking calls, calling
+ /// this function may be necessary to avoid dead threads).
+ void abandonPendingResponses() {
+ // Lock the pending responses map and sequence number manager.
+ std::lock_guard<std::mutex> Lock(ResponsesMutex);
+
+ for (auto &KV : PendingResponses)
+ KV.second->abandon();
+ PendingResponses.clear();
+ SequenceNumberMgr.reset();
+ }
+
protected:
// The LaunchPolicy type allows a launch policy to be specified when adding
// a function handler. See addHandlerImpl.
using LaunchPolicy = std::function<Error(std::function<Error()>)>;
+ FunctionIdT getInvalidFunctionId() const {
+ return FnIdAllocator.getInvalidId();
+ }
+
/// Add the given handler to the handler map and make it available for
/// autonegotiation and execution.
template <typename Func, typename HandlerT>
@@ -884,28 +912,32 @@ protected:
wrapHandler<Func>(std::move(Handler), std::move(Launch));
}
- // Abandon all outstanding results.
- void abandonPendingResponses() {
- for (auto &KV : PendingResponses)
- KV.second->abandon();
- PendingResponses.clear();
- SequenceNumberMgr.reset();
- }
-
Error handleResponse(SequenceNumberT SeqNo) {
- auto I = PendingResponses.find(SeqNo);
- if (I == PendingResponses.end()) {
- abandonPendingResponses();
- return orcError(OrcErrorCode::UnexpectedRPCResponse);
+ using Handler = typename decltype(PendingResponses)::mapped_type;
+ Handler PRHandler;
+
+ {
+ // Lock the pending responses map and sequence number manager.
+ std::unique_lock<std::mutex> Lock(ResponsesMutex);
+ auto I = PendingResponses.find(SeqNo);
+
+ if (I != PendingResponses.end()) {
+ PRHandler = std::move(I->second);
+ PendingResponses.erase(I);
+ SequenceNumberMgr.releaseSequenceNumber(SeqNo);
+ } else {
+ // Unlock the pending results map to prevent recursive lock.
+ Lock.unlock();
+ abandonPendingResponses();
+ return orcError(OrcErrorCode::UnexpectedRPCResponse);
+ }
}
- auto PRHandler = std::move(I->second);
- PendingResponses.erase(I);
- SequenceNumberMgr.releaseSequenceNumber(SeqNo);
+ assert(PRHandler &&
+ "If we didn't find a response handler we should have bailed out");
if (auto Err = PRHandler->handleResponse(C)) {
abandonPendingResponses();
- SequenceNumberMgr.reset();
return Err;
}
@@ -915,7 +947,7 @@ protected:
FunctionIdT handleNegotiate(const std::string &Name) {
auto I = LocalFunctionIds.find(Name);
if (I == LocalFunctionIds.end())
- return FnIdAllocator.getInvalidId();
+ return getInvalidFunctionId();
return I->second;
}
@@ -938,7 +970,7 @@ protected:
// If autonegotiation indicates that the remote end doesn't support this
// function, return an unknown function error.
- if (RemoteId == FnIdAllocator.getInvalidId())
+ if (RemoteId == getInvalidFunctionId())
return orcError(OrcErrorCode::UnknownRPCFunction);
// Autonegotiation succeeded and returned a valid id. Update the map and
@@ -1012,6 +1044,7 @@ protected:
std::map<FunctionIdT, WrappedHandlerFn> Handlers;
+ std::mutex ResponsesMutex;
detail::SequenceNumberManager<SequenceNumberT> SequenceNumberMgr;
std::map<SequenceNumberT, std::unique_ptr<detail::ResponseHandler<ChannelT>>>
PendingResponses;
@@ -1021,17 +1054,18 @@ protected:
template <typename ChannelT, typename FunctionIdT = uint32_t,
typename SequenceNumberT = uint32_t>
-class MultiThreadedRPC
- : public detail::RPCBase<
- MultiThreadedRPC<ChannelT, FunctionIdT, SequenceNumberT>, ChannelT,
- FunctionIdT, SequenceNumberT> {
+class MultiThreadedRPCEndpoint
+ : public detail::RPCEndpointBase<
+ MultiThreadedRPCEndpoint<ChannelT, FunctionIdT, SequenceNumberT>,
+ ChannelT, FunctionIdT, SequenceNumberT> {
private:
using BaseClass =
- detail::RPCBase<MultiThreadedRPC<ChannelT, FunctionIdT, SequenceNumberT>,
- ChannelT, FunctionIdT, SequenceNumberT>;
+ detail::RPCEndpointBase<
+ MultiThreadedRPCEndpoint<ChannelT, FunctionIdT, SequenceNumberT>,
+ ChannelT, FunctionIdT, SequenceNumberT>;
public:
- MultiThreadedRPC(ChannelT &C, bool LazyAutoNegotiation)
+ MultiThreadedRPCEndpoint(ChannelT &C, bool LazyAutoNegotiation)
: BaseClass(C, LazyAutoNegotiation) {}
/// The LaunchPolicy type allows a launch policy to be specified when adding
@@ -1061,30 +1095,41 @@ public:
std::move(Launch));
}
+ /// Add a class-method as a handler.
+ template <typename Func, typename ClassT, typename RetT, typename... ArgTs>
+ void addHandler(ClassT &Object, RetT (ClassT::*Method)(ArgTs...),
+ LaunchPolicy Launch = LaunchPolicy()) {
+ addHandler<Func>(
+ detail::MemberFnWrapper<ClassT, RetT, ArgTs...>(Object, Method),
+ Launch);
+ }
+
/// Negotiate a function id for Func with the other end of the channel.
- template <typename Func> Error negotiateFunction() {
+ template <typename Func> Error negotiateFunction(bool Retry = false) {
using OrcRPCNegotiate = typename BaseClass::OrcRPCNegotiate;
+ // Check if we already have a function id...
+ auto I = this->RemoteFunctionIds.find(Func::getPrototype());
+ if (I != this->RemoteFunctionIds.end()) {
+ // If it's valid there's nothing left to do.
+ if (I->second != this->getInvalidFunctionId())
+ return Error::success();
+ // If it's invalid and we can't re-attempt negotiation, throw an error.
+ if (!Retry)
+ return orcError(OrcErrorCode::UnknownRPCFunction);
+ }
+
+ // We don't have a function id for Func yet, call the remote to try to
+ // negotiate one.
if (auto RemoteIdOrErr = callB<OrcRPCNegotiate>(Func::getPrototype())) {
this->RemoteFunctionIds[Func::getPrototype()] = *RemoteIdOrErr;
+ if (*RemoteIdOrErr == this->getInvalidFunctionId())
+ return orcError(OrcErrorCode::UnknownRPCFunction);
return Error::success();
} else
return RemoteIdOrErr.takeError();
}
- /// Convenience method for negotiating multiple functions at once.
- template <typename Func> Error negotiateFunctions() {
- return negotiateFunction<Func>();
- }
-
- /// Convenience method for negotiating multiple functions at once.
- template <typename Func1, typename Func2, typename... Funcs>
- Error negotiateFunctions() {
- if (auto Err = negotiateFunction<Func1>())
- return Err;
- return negotiateFunctions<Func2, Funcs...>();
- }
-
/// Return type for non-blocking call primitives.
template <typename Func>
using NonBlockingCallResult = typename detail::ResultTraits<
@@ -1169,19 +1214,20 @@ public:
template <typename ChannelT, typename FunctionIdT = uint32_t,
typename SequenceNumberT = uint32_t>
-class SingleThreadedRPC
- : public detail::RPCBase<
- SingleThreadedRPC<ChannelT, FunctionIdT, SequenceNumberT>, ChannelT,
- FunctionIdT, SequenceNumberT> {
+class SingleThreadedRPCEndpoint
+ : public detail::RPCEndpointBase<
+ SingleThreadedRPCEndpoint<ChannelT, FunctionIdT, SequenceNumberT>,
+ ChannelT, FunctionIdT, SequenceNumberT> {
private:
using BaseClass =
- detail::RPCBase<SingleThreadedRPC<ChannelT, FunctionIdT, SequenceNumberT>,
- ChannelT, FunctionIdT, SequenceNumberT>;
+ detail::RPCEndpointBase<
+ SingleThreadedRPCEndpoint<ChannelT, FunctionIdT, SequenceNumberT>,
+ ChannelT, FunctionIdT, SequenceNumberT>;
using LaunchPolicy = typename BaseClass::LaunchPolicy;
public:
- SingleThreadedRPC(ChannelT &C, bool LazyAutoNegotiation)
+ SingleThreadedRPCEndpoint(ChannelT &C, bool LazyAutoNegotiation)
: BaseClass(C, LazyAutoNegotiation) {}
template <typename Func, typename HandlerT>
@@ -1197,29 +1243,31 @@ public:
}
/// Negotiate a function id for Func with the other end of the channel.
- template <typename Func> Error negotiateFunction() {
+ template <typename Func> Error negotiateFunction(bool Retry = false) {
using OrcRPCNegotiate = typename BaseClass::OrcRPCNegotiate;
+ // Check if we already have a function id...
+ auto I = this->RemoteFunctionIds.find(Func::getPrototype());
+ if (I != this->RemoteFunctionIds.end()) {
+ // If it's valid there's nothing left to do.
+ if (I->second != this->getInvalidFunctionId())
+ return Error::success();
+ // If it's invalid and we can't re-attempt negotiation, throw an error.
+ if (!Retry)
+ return orcError(OrcErrorCode::UnknownRPCFunction);
+ }
+
+ // We don't have a function id for Func yet, call the remote to try to
+ // negotiate one.
if (auto RemoteIdOrErr = callB<OrcRPCNegotiate>(Func::getPrototype())) {
this->RemoteFunctionIds[Func::getPrototype()] = *RemoteIdOrErr;
+ if (*RemoteIdOrErr == this->getInvalidFunctionId())
+ return orcError(OrcErrorCode::UnknownRPCFunction);
return Error::success();
} else
return RemoteIdOrErr.takeError();
}
- /// Convenience method for negotiating multiple functions at once.
- template <typename Func> Error negotiateFunctions() {
- return negotiateFunction<Func>();
- }
-
- /// Convenience method for negotiating multiple functions at once.
- template <typename Func1, typename Func2, typename... Funcs>
- Error negotiateFunctions() {
- if (auto Err = negotiateFunction<Func1>())
- return Err;
- return negotiateFunctions<Func2, Funcs...>();
- }
-
template <typename Func, typename... ArgTs,
typename AltRetT = typename Func::ReturnType>
typename detail::ResultTraits<AltRetT>::ErrorReturnType
@@ -1332,6 +1380,68 @@ private:
uint32_t NumOutstandingCalls;
};
+/// @brief Convenience class for grouping RPC Functions into APIs that can be
+/// negotiated as a block.
+///
+template <typename... Funcs>
+class APICalls {
+public:
+
+ /// @brief Test whether this API contains Function F.
+ template <typename F>
+ class Contains {
+ public:
+ static const bool value = false;
+ };
+
+ /// @brief Negotiate all functions in this API.
+ template <typename RPCEndpoint>
+ static Error negotiate(RPCEndpoint &R) {
+ return Error::success();
+ }
+};
+
+template <typename Func, typename... Funcs>
+class APICalls<Func, Funcs...> {
+public:
+
+ template <typename F>
+ class Contains {
+ public:
+ static const bool value = std::is_same<F, Func>::value |
+ APICalls<Funcs...>::template Contains<F>::value;
+ };
+
+ template <typename RPCEndpoint>
+ static Error negotiate(RPCEndpoint &R) {
+ if (auto Err = R.template negotiateFunction<Func>())
+ return Err;
+ return APICalls<Funcs...>::negotiate(R);
+ }
+
+};
+
+template <typename... InnerFuncs, typename... Funcs>
+class APICalls<APICalls<InnerFuncs...>, Funcs...> {
+public:
+
+ template <typename F>
+ class Contains {
+ public:
+ static const bool value =
+ APICalls<InnerFuncs...>::template Contains<F>::value |
+ APICalls<Funcs...>::template Contains<F>::value;
+ };
+
+ template <typename RPCEndpoint>
+ static Error negotiate(RPCEndpoint &R) {
+ if (auto Err = APICalls<InnerFuncs...>::negotiate(R))
+ return Err;
+ return APICalls<Funcs...>::negotiate(R);
+ }
+
+};
+
} // end namespace rpc
} // end namespace orc
} // end namespace llvm
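
As a hedged usage sketch (the Add and Halt functions and the EP endpoint below are hypothetical; only APICalls, Contains, and negotiateFunction come from this patch), related RPC functions can now be grouped and negotiated together:

#include "llvm/ExecutionEngine/Orc/RPCUtils.h"
#include <cstdint>

namespace sketch {
using namespace llvm::orc::rpc;

// Hypothetical RPC functions, declared in the usual rpc::Function style.
class Add : public Function<Add, int32_t(int32_t, int32_t)> {
public:
  static const char *getName() { return "Add"; }
};
class Halt : public Function<Halt, void()> {
public:
  static const char *getName() { return "Halt"; }
};

// Group them into one API; membership is checkable at compile time.
using SessionAPI = APICalls<Add, Halt>;
static_assert(SessionAPI::Contains<Add>::value, "Add belongs to SessionAPI");

// Negotiating the whole group against an endpoint EP would then be:
//   if (llvm::Error Err = SessionAPI::negotiate(EP))
//     return Err;
} // namespace sketch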
diff --git a/include/llvm/ExecutionEngine/Orc/RawByteChannel.h b/include/llvm/ExecutionEngine/Orc/RawByteChannel.h
index 83a7b9a844f2..3b6c84eb1965 100644
--- a/include/llvm/ExecutionEngine/Orc/RawByteChannel.h
+++ b/include/llvm/ExecutionEngine/Orc/RawByteChannel.h
@@ -48,9 +48,7 @@ public:
template <typename FunctionIdT, typename SequenceIdT>
Error startSendMessage(const FunctionIdT &FnId, const SequenceIdT &SeqNo) {
writeLock.lock();
- if (auto Err = serializeSeq(*this, FnId, SeqNo))
- return Err;
- return Error::success();
+ return serializeSeq(*this, FnId, SeqNo);
}
/// Notify the channel that we're ending a message send.
diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h
index a8c8ff9ef2eb..aeb66633f2c8 100644
--- a/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -28,14 +28,14 @@ template <> struct ScalarEnumerationTraits<TypeTestResolution::Kind> {
template <> struct MappingTraits<TypeTestResolution> {
static void mapping(IO &io, TypeTestResolution &res) {
- io.mapRequired("Kind", res.TheKind);
- io.mapRequired("SizeBitWidth", res.SizeBitWidth);
+ io.mapOptional("Kind", res.TheKind);
+ io.mapOptional("SizeBitWidth", res.SizeBitWidth);
}
};
template <> struct MappingTraits<TypeIdSummary> {
static void mapping(IO &io, TypeIdSummary& summary) {
- io.mapRequired("TTRes", summary.TTRes);
+ io.mapOptional("TTRes", summary.TTRes);
}
};
@@ -53,7 +53,7 @@ namespace yaml {
template <> struct MappingTraits<FunctionSummaryYaml> {
static void mapping(IO &io, FunctionSummaryYaml& summary) {
- io.mapRequired("TypeTests", summary.TypeTests);
+ io.mapOptional("TypeTests", summary.TypeTests);
}
};
@@ -100,8 +100,8 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
template <> struct MappingTraits<ModuleSummaryIndex> {
static void mapping(IO &io, ModuleSummaryIndex& index) {
- io.mapRequired("GlobalValueMap", index.GlobalValueMap);
- io.mapRequired("TypeIdMap", index.TypeIdMap);
+ io.mapOptional("GlobalValueMap", index.GlobalValueMap);
+ io.mapOptional("TypeIdMap", index.TypeIdMap);
}
};
diff --git a/include/llvm/IR/PassManager.h b/include/llvm/IR/PassManager.h
index 7a63956f1cdb..2e95f67a14a9 100644
--- a/include/llvm/IR/PassManager.h
+++ b/include/llvm/IR/PassManager.h
@@ -879,18 +879,22 @@ extern template class AnalysisManager<Function>;
/// \brief Convenience typedef for the Function analysis manager.
typedef AnalysisManager<Function> FunctionAnalysisManager;
-/// \brief A module analysis which acts as a proxy for a function analysis
-/// manager.
+/// \brief An analysis over an "outer" IR unit that provides access to an
+/// analysis manager over an "inner" IR unit. The inner unit must be contained
+/// in the outer unit.
///
-/// This primarily proxies invalidation information from the module analysis
-/// manager and module pass manager to a function analysis manager. You should
-/// never use a function analysis manager from within (transitively) a module
-/// pass manager unless your parent module pass has received a proxy result
-/// object for it.
+/// For example, InnerAnalysisManagerProxy<FunctionAnalysisManager, Module> is
+/// an analysis over Modules (the "outer" unit) that provides access to a
+/// Function analysis manager. The FunctionAnalysisManager is the "inner"
+/// manager being proxied, and Functions are the "inner" unit. The inner/outer
+/// relationship is valid because each Function is contained in one Module.
///
-/// Note that the proxy's result is a move-only object and represents ownership
-/// of the validity of the analyses in the \c FunctionAnalysisManager it
-/// provides.
+/// If you're (transitively) within a pass manager for an IR unit U that
+/// contains IR unit V, you should never use an analysis manager over V, except
+/// via one of these proxies.
+///
+/// Note that the proxy's result is a move-only RAII object. The validity of
+/// the analyses in the inner analysis manager is tied to its lifetime.
template <typename AnalysisManagerT, typename IRUnitT, typename... ExtraArgTs>
class InnerAnalysisManagerProxy
: public AnalysisInfoMixin<
@@ -926,23 +930,16 @@ public:
/// \brief Accessor for the analysis manager.
AnalysisManagerT &getManager() { return *InnerAM; }
- /// \brief Handler for invalidation of the outer IR unit.
- ///
- /// If this analysis itself is preserved, then we assume that the set of \c
- /// IR units that the inner analysis manager controls hasn't changed and
- /// thus we don't need to invalidate *all* cached data associated with any
- /// \c IRUnitT* in the \c AnalysisManagerT.
+ /// \brief Handler for invalidation of the outer IR unit, \c IRUnitT.
///
- /// Regardless of whether this analysis is marked as preserved, all of the
- /// analyses in the \c AnalysisManagerT are potentially invalidated (for
- /// the relevant inner set of their IR units) based on the set of preserved
- /// analyses.
+ /// If the proxy analysis itself is not preserved, we assume that the set of
+ /// inner IR objects contained in IRUnit may have changed. In this case,
+ /// we have to call \c clear() on the inner analysis manager, as it may now
+ /// have stale pointers to its inner IR objects.
///
- /// Because this needs to understand the mapping from one IR unit to an
- /// inner IR unit, this method isn't defined in the primary template.
- /// Instead, each specialization of this template will need to provide an
- /// explicit specialization of this method to handle that particular pair
- /// of IR unit and inner AnalysisManagerT.
+ /// Regardless of whether the proxy analysis is marked as preserved, all of
+ /// the analyses in the inner analysis manager are potentially invalidated
+ /// based on the set of preserved analyses.
bool invalidate(
IRUnitT &IR, const PreservedAnalyses &PA,
typename AnalysisManager<IRUnitT, ExtraArgTs...>::Invalidator &Inv);
@@ -956,13 +953,9 @@ public:
/// \brief Run the analysis pass and create our proxy result object.
///
- /// This doesn't do any interesting work, it is primarily used to insert our
- /// proxy result object into the module analysis cache so that we can proxy
- /// invalidation to the function analysis manager.
- ///
- /// In debug builds, it will also assert that the analysis manager is empty
- /// as no queries should arrive at the function analysis manager prior to
- /// this analysis being requested.
+ /// This doesn't do any interesting work; it is primarily used to insert our
+ /// proxy result object into the outer analysis cache so that we can proxy
+ /// invalidation to the inner analysis manager.
Result run(IRUnitT &IR, AnalysisManager<IRUnitT, ExtraArgTs...> &AM,
ExtraArgTs...) {
return Result(*InnerAM);
@@ -996,22 +989,24 @@ bool FunctionAnalysisManagerModuleProxy::Result::invalidate(
extern template class InnerAnalysisManagerProxy<FunctionAnalysisManager,
Module>;
-/// \brief A function analysis which acts as a proxy for a module analysis
-/// manager.
+/// \brief An analysis over an "inner" IR unit that provides access to an
+/// analysis manager over an "outer" IR unit. The inner unit must be contained
+/// in the outer unit.
///
-/// This primarily provides an accessor to a parent module analysis manager to
-/// function passes. Only the const interface of the module analysis manager is
-/// provided to indicate that once inside of a function analysis pass you
-/// cannot request a module analysis to actually run. Instead, the user must
-/// rely on the \c getCachedResult API.
+/// For example OuterAnalysisManagerProxy<ModuleAnalysisManager, Function> is an
+/// analysis over Functions (the "inner" unit) which provides access to a Module
+/// analysis manager. The ModuleAnalysisManager is the "outer" manager being
+/// proxied, and Modules are the "outer" IR unit. The inner/outer relationship
+/// is valid because each Function is contained in one Module.
///
-/// The invalidation provided by this proxy involves tracking when an
-/// invalidation event in the outer analysis manager needs to trigger an
-/// invalidation of a particular analysis on this IR unit.
+/// This proxy only exposes the const interface of the outer analysis manager,
+/// to indicate that you cannot cause an outer analysis to run from within an
+/// inner pass. Instead, you must rely on the \c getCachedResult API.
///
-/// Because outer analyses aren't invalidated while these IR units are being
-/// precessed, we have to register and handle these as deferred invalidation
-/// events.
+/// This proxy doesn't manage invalidation in any way -- that is handled by the
+/// recursive return path of each layer of the pass manager. A consequence of
+/// this is that the outer analyses may be stale. We invalidate the outer
+/// analyses only when we're done running passes over the inner IR units.
template <typename AnalysisManagerT, typename IRUnitT, typename... ExtraArgTs>
class OuterAnalysisManagerProxy
: public AnalysisInfoMixin<
@@ -1024,7 +1019,7 @@ public:
const AnalysisManagerT &getManager() const { return *AM; }
- /// \brief Handle invalidation by ignoring it, this pass is immutable.
+ /// \brief Handle invalidation by ignoring it; this pass is immutable.
bool invalidate(
IRUnitT &, const PreservedAnalyses &,
typename AnalysisManager<IRUnitT, ExtraArgTs...>::Invalidator &) {
@@ -1089,18 +1084,15 @@ AnalysisKey
extern template class OuterAnalysisManagerProxy<ModuleAnalysisManager,
Function>;
-/// Provide the \c ModuleAnalysisManager to \c Fucntion proxy.
+/// Provide the \c ModuleAnalysisManager to \c Function proxy.
typedef OuterAnalysisManagerProxy<ModuleAnalysisManager, Function>
ModuleAnalysisManagerFunctionProxy;
/// \brief Trivial adaptor that maps from a module to its functions.
///
/// Designed to allow composition of a FunctionPass(Manager) and
-/// a ModulePassManager. Note that if this pass is constructed with a pointer
-/// to a \c ModuleAnalysisManager it will run the
-/// \c FunctionAnalysisManagerModuleProxy analysis prior to running the function
-/// pass over the module to enable a \c FunctionAnalysisManager to be used
-/// within this run safely.
+/// a ModulePassManager, by running the FunctionPass(Manager) over every
+/// function in the module.
///
/// Function passes run within this adaptor can rely on having exclusive access
/// to the function they are run over. They should not read or modify any other
@@ -1115,6 +1107,10 @@ typedef OuterAnalysisManagerProxy<ModuleAnalysisManager, Function>
/// module.
/// FIXME: Make the above true for all of LLVM's actual passes, some still
/// violate this principle.
+///
+/// Note that although function passes can access module analyses, module
+/// analyses are not invalidated while the function passes are running, so they
+/// may be stale. Function analyses will not be stale.
template <typename FunctionPassT>
class ModuleToFunctionPassAdaptor
: public PassInfoMixin<ModuleToFunctionPassAdaptor<FunctionPassT>> {
@@ -1124,7 +1120,6 @@ public:
/// \brief Runs the function pass across every function in the module.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) {
- // Setup the function analysis manager from its proxy.
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
@@ -1145,10 +1140,11 @@ public:
PA.intersect(std::move(PassPA));
}
- // By definition we preserve the proxy. We also preserve all analyses on
- // Function units. This precludes *any* invalidation of function analyses
- // by the proxy, but that's OK because we've taken care to invalidate
- // analyses in the function analysis manager incrementally above.
+ // The FunctionAnalysisManagerModuleProxy is preserved because (we assume)
+ // the function passes we ran didn't add or remove any functions.
+ //
+ // We also preserve all analyses on Functions, because we did all the
+ // invalidation we needed to do above.
PA.preserveSet<AllAnalysesOn<Function>>();
PA.preserve<FunctionAnalysisManagerModuleProxy>();
return PA;
@@ -1166,7 +1162,7 @@ createModuleToFunctionPassAdaptor(FunctionPassT Pass) {
return ModuleToFunctionPassAdaptor<FunctionPassT>(std::move(Pass));
}
-/// \brief A template utility pass to force an analysis result to be available.
+/// \brief A utility pass template to force an analysis result to be available.
///
/// If there are extra arguments at the pass's run level there may also be
/// extra arguments to the analysis manager's \c getResult routine. We can't
@@ -1196,17 +1192,14 @@ struct RequireAnalysisPass
}
};
-/// \brief A template utility pass to force an analysis result to be
-/// invalidated.
-///
-/// This is a no-op pass which simply forces a specific analysis result to be
-/// invalidated when it is run.
+/// \brief A no-op pass template which simply forces a specific analysis result
+/// to be invalidated.
template <typename AnalysisT>
struct InvalidateAnalysisPass
: PassInfoMixin<InvalidateAnalysisPass<AnalysisT>> {
/// \brief Run this pass over some unit of IR.
///
- /// This pass can be run over any unit of IR and use any analysis manager
+ /// This pass can be run over any unit of IR and use any analysis manager,
/// provided they satisfy the basic API requirements. When this pass is
/// created, these methods can be instantiated to satisfy whatever the
/// context requires.
@@ -1218,10 +1211,10 @@ struct InvalidateAnalysisPass
}
};
-/// \brief A utility pass that does nothing but preserves no analyses.
+/// \brief A utility pass that does nothing, but preserves no analyses.
///
-/// As a consequence fo not preserving any analyses, this pass will force all
-/// analysis passes to be re-run to produce fresh results if any are needed.
+/// Because this preserves no analyses, any analysis passes queried after this
+/// pass runs will recompute fresh results.
struct InvalidateAllAnalysesPass : PassInfoMixin<InvalidateAllAnalysesPass> {
/// \brief Run this pass over some unit of IR.
template <typename IRUnitT, typename AnalysisManagerT, typename... ExtraArgTs>
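
To make the inner/outer relationship concrete, here is a hedged sketch of a function pass peeking at the outer (module) analysis manager through the proxy. The pass itself and the queried analysis are hypothetical, and, per the comments above, only cached results can be read:

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"

struct SketchFunctionPass : llvm::PassInfoMixin<SketchFunctionPass> {
  llvm::PreservedAnalyses run(llvm::Function &F,
                              llvm::FunctionAnalysisManager &FAM) {
    // Only the const interface of the module analysis manager is exposed, so
    // cached module-level results can be read here but never (re)computed.
    const llvm::ModuleAnalysisManager &MAM =
        FAM.getResult<llvm::ModuleAnalysisManagerFunctionProxy>(F).getManager();
    // e.g. MAM.getCachedResult<SomeModuleAnalysis>(*F.getParent());
    // (SomeModuleAnalysis is a hypothetical module analysis.)
    (void)MAM;
    return llvm::PreservedAnalyses::all();
  }
};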
diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h
index e6fe97484580..c907d6b670b5 100644
--- a/include/llvm/IR/User.h
+++ b/include/llvm/IR/User.h
@@ -238,6 +238,26 @@ public:
return make_range(value_op_begin(), value_op_end());
}
+ struct const_value_op_iterator
+ : iterator_adaptor_base<const_value_op_iterator, const_op_iterator,
+ std::random_access_iterator_tag, const Value *,
+ ptrdiff_t, const Value *, const Value *> {
+ explicit const_value_op_iterator(const Use *U = nullptr) :
+ iterator_adaptor_base(U) {}
+ const Value *operator*() const { return *I; }
+ const Value *operator->() const { return operator*(); }
+ };
+
+ const_value_op_iterator value_op_begin() const {
+ return const_value_op_iterator(op_begin());
+ }
+ const_value_op_iterator value_op_end() const {
+ return const_value_op_iterator(op_end());
+ }
+ iterator_range<const_value_op_iterator> operand_values() const {
+ return make_range(value_op_begin(), value_op_end());
+ }
+
/// \brief Drop all references to operands.
///
/// This function is in charge of "letting go" of all objects that this User
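
A hedged usage sketch of the new const overloads (the helper below is hypothetical): read-only code can now walk a User's operands as Values without needing a mutable User.

#include "llvm/IR/Constant.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

// Count the constant operands of an instruction reached through a const ref.
static unsigned countConstantOperands(const llvm::Instruction &I) {
  unsigned N = 0;
  for (const llvm::Value *Op : I.operand_values()) // const overload added above
    if (llvm::isa<llvm::Constant>(Op))
      ++N;
  return N;
}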
diff --git a/include/llvm/Support/Path.h b/include/llvm/Support/Path.h
index 0513350d446b..2bbcef0c293f 100644
--- a/include/llvm/Support/Path.h
+++ b/include/llvm/Support/Path.h
@@ -207,6 +207,14 @@ void native(const Twine &path, SmallVectorImpl<char> &result);
/// @param path A path that is transformed to native format.
void native(SmallVectorImpl<char> &path);
+/// @brief Replaces backslashes with slashes if Windows.
+///
+/// @param path processed path
+/// @result The result of replacing backslashes with forward slashes if Windows.
+/// On Unix, this function is a no-op because backslashes are valid path
+/// characters.
+std::string convert_to_slash(StringRef path);
+
/// @}
/// @name Lexical Observers
/// @{
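
A hedged usage sketch (the wrapper below is hypothetical): callers such as archive writers can normalize separators before emitting paths.

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Path.h"
#include <string>

// "dir\file.txt" becomes "dir/file.txt" on Windows; on Unix the path is
// returned unchanged, per the documentation added above.
static std::string normalizeSeparators(llvm::StringRef P) {
  return llvm::sys::path::convert_to_slash(P);
}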
diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h
index 4bebc863b4a9..dd55062e56f1 100644
--- a/include/llvm/Transforms/IPO.h
+++ b/include/llvm/Transforms/IPO.h
@@ -215,9 +215,20 @@ ModulePass *createMetaRenamerPass();
/// manager.
ModulePass *createBarrierNoopPass();
+/// What to do with the summary when running the LowerTypeTests pass.
+enum class LowerTypeTestsSummaryAction {
+ None, ///< Do nothing.
+ Import, ///< Import typeid resolutions from summary and globals.
+ Export, ///< Export typeid resolutions to summary and globals.
+};
+
/// \brief This pass lowers type metadata and the llvm.type.test intrinsic to
/// bitsets.
-ModulePass *createLowerTypeTestsPass();
+/// \param Action What to do with the summary passed as Index.
+/// \param Index The summary to use for importing or exporting; this can be
+/// null when Action is None.
+ModulePass *createLowerTypeTestsPass(LowerTypeTestsSummaryAction Action,
+ ModuleSummaryIndex *Index);
/// \brief This pass export CFI checks for use by external modules.
ModulePass *createCrossDSOCFIPass();
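
A hedged sketch of building the pass with the new signature (the helper and its surrounding setup are hypothetical): the summary pointer may be null when the action is None.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Transforms/IPO.h"

// Export typeid resolutions into Summary while lowering llvm.type.test.
static void addTypeTestLowering(llvm::legacy::PassManager &PM,
                                llvm::ModuleSummaryIndex &Summary) {
  PM.add(llvm::createLowerTypeTestsPass(
      llvm::LowerTypeTestsSummaryAction::Export, &Summary));
}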
diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h
index 9f9ce467337e..abfb24f0fe50 100644
--- a/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -21,7 +21,6 @@
#include <vector>
namespace llvm {
-class ModuleSummaryIndex;
class Pass;
class TargetLibraryInfoImpl;
class TargetMachine;
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index b4686a1ff175..8da2f0981d0c 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -1106,6 +1106,16 @@ static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q,
if (Value *V = SimplifyDiv(Instruction::UDiv, Op0, Op1, Q, MaxRecurse))
return V;
+ // udiv %V, C -> 0 if %V < C
+ if (MaxRecurse) {
+ if (Constant *C = dyn_cast_or_null<Constant>(SimplifyICmpInst(
+ ICmpInst::ICMP_ULT, Op0, Op1, Q, MaxRecurse - 1))) {
+ if (C->isAllOnesValue()) {
+ return Constant::getNullValue(Op0->getType());
+ }
+ }
+ }
+
return nullptr;
}
@@ -1247,6 +1257,16 @@ static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q,
if (Value *V = SimplifyRem(Instruction::URem, Op0, Op1, Q, MaxRecurse))
return V;
+ // urem %V, C -> %V if %V < C
+ if (MaxRecurse) {
+ if (Constant *C = dyn_cast_or_null<Constant>(SimplifyICmpInst(
+ ICmpInst::ICMP_ULT, Op0, Op1, Q, MaxRecurse - 1))) {
+ if (C->isAllOnesValue()) {
+ return Op0;
+ }
+ }
+ }
+
return nullptr;
}
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 19c0171740c9..3d85ef6988a9 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -179,9 +179,9 @@ bool Loop::isLCSSAForm(DominatorTree &DT) const {
}
bool Loop::isRecursivelyLCSSAForm(DominatorTree &DT, const LoopInfo &LI) const {
- // For each block we check that it doesn't have any uses outside of it's
- // innermost loop. This process will transitivelly guarntee that current loop
- // and all of the nested loops are in the LCSSA form.
+ // For each block we check that it doesn't have any uses outside of its
+ // innermost loop. This process will transitively guarantee that the current
+ // loop and all of the nested loops are in LCSSA form.
return all_of(this->blocks(), [&](const BasicBlock *BB) {
return isBlockInLCSSAForm(*LI.getLoopFor(BB), *BB, DT);
});
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index 2746361ab4b5..e7415e623196 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -344,38 +344,24 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
if (!InvariantGroupMD)
return MemDepResult::getUnknown();
- Value *LoadOperand = LI->getPointerOperand();
+  // Take the pointer operand after stripping all casts and zero GEPs. This
+  // way we only have to search the cast graph downwards.
+ Value *LoadOperand = LI->getPointerOperand()->stripPointerCasts();
+
// It's is not safe to walk the use list of global value, because function
// passes aren't allowed to look outside their functions.
+ // FIXME: this could be fixed by filtering instructions from outside
+ // of current function.
if (isa<GlobalValue>(LoadOperand))
return MemDepResult::getUnknown();
// Queue to process all pointers that are equivalent to load operand.
SmallVector<const Value *, 8> LoadOperandsQueue;
- SmallSet<const Value *, 14> SeenValues;
- auto TryInsertToQueue = [&](Value *V) {
- if (SeenValues.insert(V).second)
- LoadOperandsQueue.push_back(V);
- };
-
- TryInsertToQueue(LoadOperand);
+ LoadOperandsQueue.push_back(LoadOperand);
while (!LoadOperandsQueue.empty()) {
const Value *Ptr = LoadOperandsQueue.pop_back_val();
- assert(Ptr);
- if (isa<GlobalValue>(Ptr))
- continue;
-
- // Value comes from bitcast: Ptr = bitcast x. Insert x.
- if (auto *BCI = dyn_cast<BitCastInst>(Ptr))
- TryInsertToQueue(BCI->getOperand(0));
- // Gep with zeros is equivalent to bitcast.
- // FIXME: we are not sure if some bitcast should be canonicalized to gep 0
- // or gep 0 to bitcast because of SROA, so there are 2 forms. When typeless
- // pointers will be upstream then both cases will be gone (and this BFS
- // also won't be needed).
- if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr))
- if (GEP->hasAllZeroIndices())
- TryInsertToQueue(GEP->getOperand(0));
+ assert(Ptr && !isa<GlobalValue>(Ptr) &&
+ "Null or GlobalValue should not be inserted");
for (const Use &Us : Ptr->uses()) {
auto *U = dyn_cast<Instruction>(Us.getUser());
@@ -385,13 +371,17 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
// Bitcast or gep with zeros are using Ptr. Add to queue to check it's
// users. U = bitcast Ptr
if (isa<BitCastInst>(U)) {
- TryInsertToQueue(U);
+ LoadOperandsQueue.push_back(U);
continue;
}
- // U = getelementptr Ptr, 0, 0...
+ // Gep with zeros is equivalent to bitcast.
+ // FIXME: we are not sure if some bitcast should be canonicalized to gep 0
+      // or gep 0 to bitcast because of SROA, so there are 2 forms. When
+      // typeless pointers are ready, both cases will be gone
+      // (and this BFS also won't be needed).
if (auto *GEP = dyn_cast<GetElementPtrInst>(U))
if (GEP->hasAllZeroIndices()) {
- TryInsertToQueue(U);
+ LoadOperandsQueue.push_back(U);
continue;
}
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 5e566bcdaff4..44f1a6dde0d2 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -10012,6 +10012,18 @@ void ScalarEvolution::verify() const {
// TODO: Verify more things.
}
+bool ScalarEvolution::invalidate(
+ Function &F, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &Inv) {
+ // Invalidate the ScalarEvolution object whenever it isn't preserved or one
+ // of its dependencies is invalidated.
+ auto PAC = PA.getChecker<ScalarEvolutionAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>()) ||
+ Inv.invalidate<AssumptionAnalysis>(F, PA) ||
+ Inv.invalidate<DominatorTreeAnalysis>(F, PA) ||
+ Inv.invalidate<LoopAnalysis>(F, PA);
+}
+
AnalysisKey ScalarEvolutionAnalysis::Key;
ScalarEvolution ScalarEvolutionAnalysis::run(Function &F,
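
A hedged sketch of the other side of this contract (the pass below is hypothetical): a function transform that leaves assumptions, the dominator tree, and loop structure intact can mark ScalarEvolution preserved, and the invalidate() above will then keep the cached SCEV.

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"

struct SketchCleanupPass : llvm::PassInfoMixin<SketchCleanupPass> {
  llvm::PreservedAnalyses run(llvm::Function &F,
                              llvm::FunctionAnalysisManager &AM) {
    // ...a transform that changes no CFG edges, dominators, loops, or
    // assumptions would go here...
    llvm::PreservedAnalyses PA;
    PA.preserve<llvm::ScalarEvolutionAnalysis>();
    // Per ScalarEvolution::invalidate above, its dependencies must also
    // survive for the cached SCEV to remain valid.
    PA.preserve<llvm::AssumptionAnalysis>();
    PA.preserve<llvm::DominatorTreeAnalysis>();
    PA.preserve<llvm::LoopAnalysis>();
    return PA;
  }
};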
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 073b4e6ab26a..d31472c0d33c 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -3257,6 +3257,7 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
case Intrinsic::dbg_value:
return true;
+ case Intrinsic::bitreverse:
case Intrinsic::bswap:
case Intrinsic::ctlz:
case Intrinsic::ctpop:
diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp
index 460d39cc28d8..4a5d18e2db75 100644
--- a/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -429,7 +429,7 @@ class MetadataLoader::MetadataLoaderImpl {
/// Populate the index above to enable lazily loading of metadata, and load
/// the named metadata as well as the transitively referenced global
/// Metadata.
- Expected<bool> lazyLoadModuleMetadataBlock(PlaceholderQueue &Placeholders);
+ Expected<bool> lazyLoadModuleMetadataBlock();
/// On-demand loading of a single metadata. Requires the index above to be
/// populated.
@@ -516,8 +516,8 @@ Error error(const Twine &Message) {
Message, make_error_code(BitcodeError::CorruptedBitcode));
}
-Expected<bool> MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock(
- PlaceholderQueue &Placeholders) {
+Expected<bool>
+MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock() {
IndexCursor = Stream;
SmallVector<uint64_t, 64> Record;
// Get the abbrevs, and preload record positions to make them lazy-loadable.
@@ -701,7 +701,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) {
// then load individual record as needed, starting with the named metadata.
if (ModuleLevel && IsImporting && MetadataList.empty() &&
!DisableLazyLoading) {
- auto SuccessOrErr = lazyLoadModuleMetadataBlock(Placeholders);
+ auto SuccessOrErr = lazyLoadModuleMetadataBlock();
if (!SuccessOrErr)
return SuccessOrErr.takeError();
if (SuccessOrErr.get()) {
@@ -1561,7 +1561,6 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment(
return error("Invalid record");
SmallVector<uint64_t, 64> Record;
-
PlaceholderQueue Placeholders;
while (true) {
@@ -1608,10 +1607,12 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment(
auto Idx = Record[i + 1];
if (Idx < (MDStringRef.size() + GlobalMetadataBitPosIndex.size()) &&
- !MetadataList.lookup(Idx))
+ !MetadataList.lookup(Idx)) {
// Load the attachment if it is in the lazy-loadable range and hasn't
// been loaded yet.
lazyLoadOneMetadata(Idx, Placeholders);
+ resolveForwardRefsAndPlaceholders(Placeholders);
+ }
Metadata *Node = MetadataList.getMetadataFwdRef(Idx);
if (isa<LocalAsMetadata>(Node))
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a37f4e1116b4..6b62f11f1240 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1714,7 +1714,7 @@ void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N,
EVT CCT = getSetCCResultType(NVT);
// Hi part is always the same op
- Hi = DAG.getNode(N->getOpcode(), DL, {NVT, NVT}, {LHSH, RHSH});
+ Hi = DAG.getNode(N->getOpcode(), DL, NVT, {LHSH, RHSH});
// We need to know whether to select Lo part that corresponds to 'winning'
// Hi part or if Hi parts are equal.
@@ -1725,7 +1725,7 @@ void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N,
SDValue LoCmp = DAG.getSelect(DL, NVT, IsHiLeft, LHSL, RHSL);
// Recursed Lo part if Hi parts are equal, this uses unsigned version
- SDValue LoMinMax = DAG.getNode(LoOpc, DL, {NVT, NVT}, {LHSL, RHSL});
+ SDValue LoMinMax = DAG.getNode(LoOpc, DL, NVT, {LHSL, RHSL});
Lo = DAG.getSelect(DL, NVT, IsHiEq, LoMinMax, LoCmp);
}
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index bae828a2263c..234b2043a6a1 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -381,7 +381,6 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
I != E; ++I) {
if (DCELimit != -1 && (int)NumDead >= DCELimit)
break;
-
int FirstSS, SecondSS;
if (TII->isStackSlotCopy(*I, FirstSS, SecondSS) && FirstSS == SecondSS &&
FirstSS != -1) {
@@ -392,12 +391,18 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
}
MachineBasicBlock::iterator NextMI = std::next(I);
- if (NextMI == MBB->end()) continue;
+ MachineBasicBlock::iterator ProbableLoadMI = I;
unsigned LoadReg = 0;
unsigned StoreReg = 0;
if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS)))
continue;
+    // Skip any debug-value pseudo instructions between the load and the store.
+ while ((NextMI != E) && NextMI->isDebugValue()) {
+ ++NextMI;
+ ++I;
+ }
+ if (NextMI == E) continue;
if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS)))
continue;
if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue;
@@ -407,7 +412,7 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
if (NextMI->findRegisterUseOperandIdx(LoadReg, true, nullptr) != -1) {
++NumDead;
- toErase.push_back(&*I);
+ toErase.push_back(&*ProbableLoadMI);
}
toErase.push_back(&*NextMI);
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index a5a30fab5b69..8f6b1849169a 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -896,6 +896,48 @@ uint32_t RuntimeDyldELF::getMatchingLoRelocation(uint32_t RelType,
return ELF::R_MIPS_NONE;
}
+// Sometimes we don't need to create a thunk for a branch.
+// This typically happens when the branch target is located
+// in the same object file. In that case the target is either
+// a weak symbol or a symbol in a different executable section.
+// This function checks whether the branch target is located in
+// the same object file and whether the distance between source
+// and target fits the R_AARCH64_CALL26 relocation. If both
+// conditions are met, it emits a direct jump to the target and
+// returns true. Otherwise it returns false and a thunk is created.
+bool RuntimeDyldELF::resolveAArch64ShortBranch(
+ unsigned SectionID, relocation_iterator RelI,
+ const RelocationValueRef &Value) {
+ uint64_t Address;
+ if (Value.SymbolName) {
+ auto Loc = GlobalSymbolTable.find(Value.SymbolName);
+
+ // Don't create direct branch for external symbols.
+ if (Loc == GlobalSymbolTable.end())
+ return false;
+
+ const auto &SymInfo = Loc->second;
+ Address =
+ uint64_t(Sections[SymInfo.getSectionID()].getLoadAddressWithOffset(
+ SymInfo.getOffset()));
+ } else {
+ Address = uint64_t(Sections[Value.SectionID].getLoadAddress());
+ }
+ uint64_t Offset = RelI->getOffset();
+ uint64_t SourceAddress = Sections[SectionID].getLoadAddressWithOffset(Offset);
+
+  // R_AARCH64_CALL26 requires the immediate to be in the range
+  // -2^27 <= imm < 2^27. If the distance between source and target is out
+  // of range, we should create a thunk.
+ if (!isInt<28>(Address + Value.Addend - SourceAddress))
+ return false;
+
+ resolveRelocation(Sections[SectionID], Offset, Address, RelI->getType(),
+ Value.Addend);
+
+ return true;
+}
+
Expected<relocation_iterator>
RuntimeDyldELF::processRelocationRef(
unsigned SectionID, relocation_iterator RelI, const ObjectFile &O,
@@ -1003,7 +1045,7 @@ RuntimeDyldELF::processRelocationRef(
(uint64_t)Section.getAddressWithOffset(i->second),
RelType, 0);
DEBUG(dbgs() << " Stub function found\n");
- } else {
+ } else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) {
// Create a new stub function.
DEBUG(dbgs() << " Create a new stub function\n");
Stubs[Value] = Section.getStubOffset();
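As a sanity check on the isInt<28> test in resolveAArch64ShortBranch above (a standalone sketch, not part of the patch): R_AARCH64_CALL26 carries a 26-bit word displacement, i.e. a signed 28-bit byte displacement, so the representable range is exactly -2^27 .. 2^27 - 4.

#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

int main() {
  assert(isInt<28>((1LL << 27) - 4));   // farthest forward branch: representable
  assert(!isInt<28>(1LL << 27));        // one instruction too far forward
  assert(isInt<28>(-(1LL << 27)));      // farthest backward branch: representable
  assert(!isInt<28>(-(1LL << 27) - 4)); // one instruction too far backward
  return 0;
}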
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index 796127ab92bd..d1867d091fe2 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -40,6 +40,9 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
void resolveAArch64Relocation(const SectionEntry &Section, uint64_t Offset,
uint64_t Value, uint32_t Type, int64_t Addend);
+ bool resolveAArch64ShortBranch(unsigned SectionID, relocation_iterator RelI,
+ const RelocationValueRef &Value);
+
void resolveARMRelocation(const SectionEntry &Section, uint64_t Offset,
uint32_t Value, uint32_t Type, int32_t Addend);
diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp
index 66ffe6db29d6..928f69a17de9 100644
--- a/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -196,8 +196,15 @@ crossImportIntoModule(Module &TheModule, const ModuleSummaryIndex &Index,
};
FunctionImporter Importer(Index, Loader);
- if (!Importer.importFunctions(TheModule, ImportList))
+ Expected<bool> Result = Importer.importFunctions(TheModule, ImportList);
+ if (!Result) {
+ handleAllErrors(Result.takeError(), [&](ErrorInfoBase &EIB) {
+ SMDiagnostic Err = SMDiagnostic(TheModule.getModuleIdentifier(),
+ SourceMgr::DK_Error, EIB.message());
+ Err.print("ThinLTO", errs());
+ });
report_fatal_error("importFunctions failed");
+ }
}
static void optimizeModule(Module &TheModule, TargetMachine &TM,
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index 40105000c56c..5b018676eba3 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -2823,7 +2823,11 @@ StringRef MachORebaseEntry::typeName() const {
}
bool MachORebaseEntry::operator==(const MachORebaseEntry &Other) const {
+#ifdef EXPENSIVE_CHECKS
assert(Opcodes == Other.Opcodes && "compare iterators of different files");
+#else
+ assert(Opcodes.data() == Other.Opcodes.data() && "compare iterators of different files");
+#endif
return (Ptr == Other.Ptr) &&
(RemainingLoopCount == Other.RemainingLoopCount) &&
(Done == Other.Done);
@@ -3073,7 +3077,11 @@ uint32_t MachOBindEntry::flags() const { return Flags; }
int MachOBindEntry::ordinal() const { return Ordinal; }
bool MachOBindEntry::operator==(const MachOBindEntry &Other) const {
+#ifdef EXPENSIVE_CHECKS
assert(Opcodes == Other.Opcodes && "compare iterators of different files");
+#else
+ assert(Opcodes.data() == Other.Opcodes.data() && "compare iterators of different files");
+#endif
return (Ptr == Other.Ptr) &&
(RemainingLoopCount == Other.RemainingLoopCount) &&
(Done == Other.Done);
diff --git a/lib/Object/ModuleSummaryIndexObjectFile.cpp b/lib/Object/ModuleSummaryIndexObjectFile.cpp
index 202783e7d993..11ace84b9ceb 100644
--- a/lib/Object/ModuleSummaryIndexObjectFile.cpp
+++ b/lib/Object/ModuleSummaryIndexObjectFile.cpp
@@ -22,6 +22,12 @@
using namespace llvm;
using namespace object;
+static llvm::cl::opt<bool> IgnoreEmptyThinLTOIndexFile(
+ "ignore-empty-index-file", llvm::cl::ZeroOrMore,
+ llvm::cl::desc(
+ "Ignore an empty index file and perform non-ThinLTO compilation"),
+ llvm::cl::init(false));
+
ModuleSummaryIndexObjectFile::ModuleSummaryIndexObjectFile(
MemoryBufferRef Object, std::unique_ptr<ModuleSummaryIndex> I)
: SymbolicFile(Binary::ID_ModuleSummaryIndex, Object), Index(std::move(I)) {
@@ -97,6 +103,8 @@ llvm::getModuleSummaryIndexForFile(StringRef Path) {
if (EC)
return errorCodeToError(EC);
MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef();
+ if (IgnoreEmptyThinLTOIndexFile && !BufferRef.getBufferSize())
+ return nullptr;
Expected<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr =
object::ModuleSummaryIndexObjectFile::create(BufferRef);
if (!ObjOrErr)
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index 0a989706b436..3889902eea54 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -373,7 +373,7 @@ void Option::removeArgument() { GlobalParser->removeOption(this); }
void Option::setArgStr(StringRef S) {
if (FullyInitialized)
GlobalParser->updateArgStr(this, S);
- assert(S[0] != '-' && "Option can't start with '-");
+ assert((S.empty() || S[0] != '-') && "Option can't start with '-");
ArgStr = S;
}
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index 0616d05aff57..4bb035eeccca 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -571,6 +571,16 @@ void native(SmallVectorImpl<char> &Path) {
#endif
}
+std::string convert_to_slash(StringRef path) {
+#ifdef LLVM_ON_WIN32
+ std::string s = path.str();
+ std::replace(s.begin(), s.end(), '\\', '/');
+ return s;
+#else
+ return path;
+#endif
+}
+
StringRef filename(StringRef path) {
return *rbegin(path);
}
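A small usage sketch for the new sys::path::convert_to_slash helper (the call below is illustrative; behaviour depends on the host): on LLVM_ON_WIN32 builds backslashes are rewritten to forward slashes, on every other host the input is returned unchanged.

#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  // Prints "foo/bar/baz.o" on Windows builds; prints the original backslash
  // form elsewhere, where the function is a pass-through.
  llvm::outs() << llvm::sys::path::convert_to_slash("foo\\bar\\baz.o") << "\n";
  return 0;
}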
diff --git a/lib/Support/TarWriter.cpp b/lib/Support/TarWriter.cpp
index 5fc17d276377..f79b364dc1f7 100644
--- a/lib/Support/TarWriter.cpp
+++ b/lib/Support/TarWriter.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Path.h"
using namespace llvm;
@@ -109,27 +110,44 @@ static void writePaxHeader(raw_fd_ostream &OS, StringRef Path) {
pad(OS);
}
+// In the Ustar header, a path can be split at any '/' to store
+// a path into UstarHeader::Name and UstarHeader::Prefix. This
+// function splits a given path for that purpose.
+static std::pair<StringRef, StringRef> splitPath(StringRef Path) {
+ if (Path.size() <= sizeof(UstarHeader::Name))
+ return {"", Path};
+ size_t Sep = Path.rfind('/', sizeof(UstarHeader::Name) + 1);
+ if (Sep == StringRef::npos)
+ return {"", Path};
+ return {Path.substr(0, Sep), Path.substr(Sep + 1)};
+}
+
+// Returns true if a given path can be stored in a Ustar header
+// without the PAX extension.
+static bool fitsInUstar(StringRef Path) {
+ StringRef Prefix;
+ StringRef Name;
+ std::tie(Prefix, Name) = splitPath(Path);
+ return Name.size() <= sizeof(UstarHeader::Name);
+}
+
// The PAX header is an extended format, so a PAX header needs
// to be followed by a "real" header.
static void writeUstarHeader(raw_fd_ostream &OS, StringRef Path, size_t Size) {
+ StringRef Prefix;
+ StringRef Name;
+ std::tie(Prefix, Name) = splitPath(Path);
+
UstarHeader Hdr = {};
- memcpy(Hdr.Name, Path.data(), Path.size());
+ memcpy(Hdr.Name, Name.data(), Name.size());
memcpy(Hdr.Mode, "0000664", 8);
snprintf(Hdr.Size, sizeof(Hdr.Size), "%011zo", Size);
memcpy(Hdr.Magic, "ustar", 6);
+ memcpy(Hdr.Prefix, Prefix.data(), Prefix.size());
computeChecksum(Hdr);
OS << StringRef(reinterpret_cast<char *>(&Hdr), sizeof(Hdr));
}
-// We want to use '/' as a path separator even on Windows.
-// This function canonicalizes a given path.
-static std::string canonicalize(std::string S) {
-#ifdef LLVM_ON_WIN32
- std::replace(S.begin(), S.end(), '\\', '/');
-#endif
- return S;
-}
-
// Creates a TarWriter instance and returns it.
Expected<std::unique_ptr<TarWriter>> TarWriter::create(StringRef OutputPath,
StringRef BaseDir) {
@@ -145,8 +163,8 @@ TarWriter::TarWriter(int FD, StringRef BaseDir)
// Append a given file to an archive.
void TarWriter::append(StringRef Path, StringRef Data) {
// Write Path and Data.
- std::string S = BaseDir + "/" + canonicalize(Path) + "\0";
- if (S.size() <= sizeof(UstarHeader::Name)) {
+ std::string S = BaseDir + "/" + sys::path::convert_to_slash(Path) + "\0";
+ if (fitsInUstar(S)) {
writeUstarHeader(OS, S, Data.size());
} else {
writePaxHeader(OS, S);
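A worked sketch of the splitting rule used above, outside of TarWriter (assuming the standard Ustar layout where UstarHeader::Name is 100 bytes): a path longer than the Name field is split at a '/' so the leading part lands in Prefix and the trailing part in Name.

#include <cassert>
#include <string>
#include <utility>

// Analogous to the static splitPath above, using std::string instead of
// StringRef and assuming a 100-byte Name field.
static std::pair<std::string, std::string> splitUstar(const std::string &Path) {
  const size_t NameSize = 100;
  if (Path.size() <= NameSize)
    return {"", Path};
  size_t Sep = Path.rfind('/', NameSize + 1);
  if (Sep == std::string::npos)
    return {"", Path};
  return {Path.substr(0, Sep), Path.substr(Sep + 1)};
}

int main() {
  // A 111-character path: everything before the '/' at index 95 becomes the
  // Prefix, the 15-character remainder becomes the Name, so it fits in Ustar.
  std::string Dir(95, 'a');
  auto P = splitUstar(Dir + "/subdir/object.o");
  assert(P.first == Dir);
  assert(P.second == "subdir/object.o");
  return 0;
}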
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index ef3b44f7c211..2b4fc5397b18 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -608,6 +608,10 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
if ((C = dyn_cast<ConstantSDNode>(Addr))) {
Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+ } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
+ (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
+ Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+ Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
(C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
Base = Addr.getOperand(0);
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0b0a0e7d083e..730bcdcf7afa 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -172,16 +172,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
- setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
- setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
-
- setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
- setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
-
- setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
- setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
-
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
setTruncStoreAction(MVT::i64, MVT::i8, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index a6c31629e7c4..da9d009c542b 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -822,6 +822,7 @@ public:
bool isForcedVOP3() const { return ForcedEncodingSize == 64; }
bool isForcedDPP() const { return ForcedDPP; }
bool isForcedSDWA() const { return ForcedSDWA; }
+ ArrayRef<unsigned> getMatchedVariants() const;
std::unique_ptr<AMDGPUOperand> parseRegister();
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
@@ -1630,31 +1631,44 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
return Match_Success;
}
+// Returns the asm variants to check, based on any forced encoding.
+ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
+ if (getForcedEncodingSize() == 32) {
+ static const unsigned Variants[] = {AMDGPUAsmVariants::DEFAULT};
+ return makeArrayRef(Variants);
+ }
+
+ if (isForcedVOP3()) {
+ static const unsigned Variants[] = {AMDGPUAsmVariants::VOP3};
+ return makeArrayRef(Variants);
+ }
+
+ if (isForcedSDWA()) {
+ static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA};
+ return makeArrayRef(Variants);
+ }
+
+ if (isForcedDPP()) {
+ static const unsigned Variants[] = {AMDGPUAsmVariants::DPP};
+ return makeArrayRef(Variants);
+ }
+
+ static const unsigned Variants[] = {
+ AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3,
+ AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP
+ };
+
+ return makeArrayRef(Variants);
+}
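The refactoring above leans on a small idiom worth spelling out: an ArrayRef that points into a function-local static array stays valid after the function returns, so the per-call std::vector construction in the old code can be dropped. A minimal sketch with illustrative values:

#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

static ArrayRef<unsigned> allVariants() {
  // Static storage duration: the returned ArrayRef stays valid in the caller.
  static const unsigned Variants[] = {0, 1, 2, 3};
  return makeArrayRef(Variants);
}

int main() {
  for (unsigned V : allVariants())
    (void)V; // iterate without owning or copying the array
  return 0;
}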
+
bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
- // What asm variants we should check
- std::vector<unsigned> MatchedVariants;
- if (getForcedEncodingSize() == 32) {
- MatchedVariants = {AMDGPUAsmVariants::DEFAULT};
- } else if (isForcedVOP3()) {
- MatchedVariants = {AMDGPUAsmVariants::VOP3};
- } else if (isForcedSDWA()) {
- MatchedVariants = {AMDGPUAsmVariants::SDWA};
- } else if (isForcedDPP()) {
- MatchedVariants = {AMDGPUAsmVariants::DPP};
- } else {
- MatchedVariants = {AMDGPUAsmVariants::DEFAULT,
- AMDGPUAsmVariants::VOP3,
- AMDGPUAsmVariants::SDWA,
- AMDGPUAsmVariants::DPP};
- }
-
MCInst Inst;
unsigned Result = Match_Success;
- for (auto Variant : MatchedVariants) {
+ for (auto Variant : getMatchedVariants()) {
uint64_t EI;
auto R = MatchInstructionImpl(Operands, Inst, EI, MatchingInlineAsm,
Variant);
@@ -3486,7 +3500,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
for (unsigned E = Operands.size(); I != E; ++I) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
- if ((BasicInstType == SIInstrFlags::VOPC ||
+ if ((BasicInstType == SIInstrFlags::VOPC ||
BasicInstType == SIInstrFlags::VOP2)&&
Op.isReg() &&
Op.Reg.RegNo == AMDGPU::VCC) {
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 89c9266746ac..de7ce5cb9e47 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -99,6 +99,18 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::i32, MVT::i8, Custom);
setTruncStoreAction(MVT::i32, MVT::i16, Custom);
+ // We need to include these since trunc STORES to PRIVATE need
+ // special handling to accommodate RMW
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom);
+ setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom);
+ setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom);
// Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
@@ -1087,79 +1099,114 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth,
SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
SelectionDAG &DAG) const {
SDLoc DL(Store);
+  // TODO: Who creates the i8 stores?
+ assert(Store->isTruncatingStore()
+ || Store->getValue().getValueType() == MVT::i8);
+ assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS);
- unsigned Mask = 0;
+ SDValue Mask;
if (Store->getMemoryVT() == MVT::i8) {
- Mask = 0xff;
+ assert(Store->getAlignment() >= 1);
+ Mask = DAG.getConstant(0xff, DL, MVT::i32);
} else if (Store->getMemoryVT() == MVT::i16) {
- Mask = 0xffff;
+ assert(Store->getAlignment() >= 2);
+    Mask = DAG.getConstant(0xffff, DL, MVT::i32);
+ } else {
+ llvm_unreachable("Unsupported private trunc store");
}
SDValue Chain = Store->getChain();
SDValue BasePtr = Store->getBasePtr();
+ SDValue Offset = Store->getOffset();
EVT MemVT = Store->getMemoryVT();
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
- DAG.getConstant(2, DL, MVT::i32));
- SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
- Chain, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32));
+ SDValue LoadPtr = BasePtr;
+ if (!Offset.isUndef()) {
+ LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
+ }
+
+ // Get dword location
+ // TODO: this should be eliminated by the future SHR ptr, 2
+ SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
+ DAG.getConstant(0xfffffffc, DL, MVT::i32));
+
+ // Load dword
+ // TODO: can we be smarter about machine pointer info?
+ SDValue Dst = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
- SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
+ Chain = Dst.getValue(1);
+
+ // Get offset in dword
+ SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
DAG.getConstant(0x3, DL, MVT::i32));
+ // Convert byte offset to bit shift
SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
DAG.getConstant(3, DL, MVT::i32));
+  // TODO: Contrary to the name of the function,
+  // it also handles sub-i32 non-truncating stores (like i1).
SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
Store->getValue());
+ // Mask the value to the right type
SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
+ // Shift the value in place
SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
MaskedValue, ShiftAmt);
- SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
- DAG.getConstant(Mask, DL, MVT::i32),
- ShiftAmt);
- DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
- DAG.getConstant(0xffffffff, DL, MVT::i32));
+ // Shift the mask in place
+ SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32, Mask, ShiftAmt);
+
+  // Invert the mask. NOTE: if we had native ROL instructions we could
+  // use the inverted mask directly.
+ DstMask = DAG.getNOT(DL, DstMask, MVT::i32);
+
+  // Clear out the target bits
Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
+ // Add the new bits
SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
- return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
- Chain, Value, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32));
+
+ // Store dword
+ // TODO: Can we be smarter about MachinePointerInfo?
+ return DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo());
}
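The shift/mask arithmetic that the lowering above builds out of ISD nodes can be checked with a small standalone sketch (values are illustrative, not from the patch): storing an i8 into byte 2 of a private dword rewrites only that byte.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Dword = 0xAABBCCDD; // dword loaded from the containing location
  uint32_t Addr  = 0x106;      // byte address of the i8 store
  uint32_t Val   = 0xEE;       // value being stored

  uint32_t ByteIdx  = Addr & 0x3;           // 2
  uint32_t ShiftAmt = ByteIdx << 3;         // 16: byte offset -> bit shift
  uint32_t DstMask  = ~(0xFFu << ShiftAmt); // keep the other three bytes
  uint32_t Result   = (Dword & DstMask) | ((Val & 0xFFu) << ShiftAmt);

  assert(Result == 0xAAEECCDD); // only byte 2 of the dword changed
  return 0;
}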
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
unsigned AS = StoreNode->getAddressSpace();
+
+ SDValue Chain = StoreNode->getChain();
+ SDValue Ptr = StoreNode->getBasePtr();
SDValue Value = StoreNode->getValue();
- EVT ValueVT = Value.getValueType();
+
+ EVT VT = Value.getValueType();
EVT MemVT = StoreNode->getMemoryVT();
- unsigned Align = StoreNode->getAlignment();
+ EVT PtrVT = Ptr.getValueType();
+ SDLoc DL(Op);
+
+ // Neither LOCAL nor PRIVATE can do vectors at the moment
if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
- ValueVT.isVector()) {
- return SplitVectorStore(Op, DAG);
+ VT.isVector()) {
+ return scalarizeVectorStore(StoreNode, DAG);
}
- // Private AS needs special fixes
- if (Align < MemVT.getStoreSize() && (AS != AMDGPUAS::PRIVATE_ADDRESS) &&
+ unsigned Align = StoreNode->getAlignment();
+ if (Align < MemVT.getStoreSize() &&
!allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
return expandUnalignedStore(StoreNode, DAG);
}
- SDLoc DL(Op);
- SDValue Chain = StoreNode->getChain();
- SDValue Ptr = StoreNode->getBasePtr();
+ SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
+ DAG.getConstant(2, DL, PtrVT));
if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
// It is beneficial to create MSKOR here instead of combiner to avoid
// artificial dependencies introduced by RMW
if (StoreNode->isTruncatingStore()) {
- EVT VT = Value.getValueType();
assert(VT.bitsLE(MVT::i32));
SDValue MaskConstant;
if (MemVT == MVT::i8) {
@@ -1169,15 +1216,19 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
assert(StoreNode->getAlignment() >= 2);
MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
}
- SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
- DAG.getConstant(2, DL, MVT::i32));
- SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
- DAG.getConstant(0x00000003, DL, VT));
+
+ SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr,
+ DAG.getConstant(0x00000003, DL, PtrVT));
+ SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
+ DAG.getConstant(3, DL, VT));
+
+      // Put the mask in the correct place
+ SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift);
+
+      // Mask the value to the stored memory width
SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
- SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
- DAG.getConstant(3, DL, VT));
- SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
- SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
+ SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift);
+
// XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
// vector instead.
SDValue Src[4] = {
@@ -1191,12 +1242,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
Op->getVTList(), Args, MemVT,
StoreNode->getMemOperand());
- } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
- ValueVT.bitsGE(MVT::i32)) {
+ } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) {
// Convert pointer from byte address to dword address.
- Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
- DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
- Ptr, DAG.getConstant(2, DL, MVT::i32)));
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
llvm_unreachable("Truncated and indexed stores not supported yet");
@@ -1207,49 +1255,22 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
}
}
+ // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
if (AS != AMDGPUAS::PRIVATE_ADDRESS)
return SDValue();
if (MemVT.bitsLT(MVT::i32))
return lowerPrivateTruncStore(StoreNode, DAG);
- // Lowering for indirect addressing
- const MachineFunction &MF = DAG.getMachineFunction();
- const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
- unsigned StackWidth = TFL->getStackWidth(MF);
-
- Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
-
- if (ValueVT.isVector()) {
- unsigned NumElemVT = ValueVT.getVectorNumElements();
- EVT ElemVT = ValueVT.getVectorElementType();
- SmallVector<SDValue, 4> Stores(NumElemVT);
-
- assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
- "vector width in load");
-
- for (unsigned i = 0; i < NumElemVT; ++i) {
- unsigned Channel, PtrIncr;
- getStackAddress(StackWidth, i, Channel, PtrIncr);
- Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
- DAG.getConstant(PtrIncr, DL, MVT::i32));
- SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
- Value, DAG.getConstant(i, DL, MVT::i32));
-
- Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
- Chain, Elem, Ptr,
- DAG.getTargetConstant(Channel, DL, MVT::i32));
- }
- Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
- } else {
- if (ValueVT == MVT::i8) {
- Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
- }
- Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
+ // Standard i32+ store, tag it with DWORDADDR to note that the address
+ // has been shifted
+ if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
+ return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
}
- return Chain;
+ // Tagged i32+ stores will be matched by patterns
+ return SDValue();
}
// return (512 + (kc_bank << 12)
@@ -1299,51 +1320,50 @@ SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
LoadSDNode *Load = cast<LoadSDNode>(Op);
ISD::LoadExtType ExtType = Load->getExtensionType();
EVT MemVT = Load->getMemoryVT();
+ assert(Load->getAlignment() >= MemVT.getStoreSize());
- // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
- // register (2-)byte extract.
+ SDValue BasePtr = Load->getBasePtr();
+ SDValue Chain = Load->getChain();
+ SDValue Offset = Load->getOffset();
- // Get Register holding the target.
- SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
- DAG.getConstant(2, DL, MVT::i32));
- // Load the Register.
- SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
- Load->getChain(),
- Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32),
- Op.getOperand(2));
+ SDValue LoadPtr = BasePtr;
+ if (!Offset.isUndef()) {
+ LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
+ }
+
+ // Get dword location
+ // NOTE: this should be eliminated by the future SHR ptr, 2
+ SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
+ DAG.getConstant(0xfffffffc, DL, MVT::i32));
+
+ // Load dword
+ // TODO: can we be smarter about machine pointer info?
+ SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
// Get offset within the register.
SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
- Load->getBasePtr(),
- DAG.getConstant(0x3, DL, MVT::i32));
+ LoadPtr, DAG.getConstant(0x3, DL, MVT::i32));
// Bit offset of target byte (byteIdx * 8).
SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
DAG.getConstant(3, DL, MVT::i32));
// Shift to the right.
- Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
+ SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt);
// Eliminate the upper bits by setting them to ...
EVT MemEltVT = MemVT.getScalarType();
- // ... ones.
- if (ExtType == ISD::SEXTLOAD) {
+ if (ExtType == ISD::SEXTLOAD) { // ... ones.
SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
-
- SDValue Ops[] = {
- DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
- Load->getChain()
- };
-
- return DAG.getMergeValues(Ops, DL);
+ Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
+ } else { // ... or zeros.
+ Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
}
- // ... or zeros.
SDValue Ops[] = {
- DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
- Load->getChain()
+ Ret,
+ Read.getValue(1) // This should be our output chain
};
return DAG.getMergeValues(Ops, DL);
@@ -1365,12 +1385,10 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = LoadNode->getChain();
SDValue Ptr = LoadNode->getBasePtr();
- if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
- SDValue MergedValues[2] = {
- scalarizeVectorLoad(LoadNode, DAG),
- Chain
- };
- return DAG.getMergeValues(MergedValues, DL);
+ if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+ VT.isVector()) {
+ return scalarizeVectorLoad(LoadNode, DAG);
}
int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
@@ -1421,8 +1439,6 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return DAG.getMergeValues(MergedValues, DL);
}
- SDValue LoweredLoad;
-
// For most operations returning SDValue() will result in the node being
// expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
// need to manually expand loads that may be legal in some address spaces and
@@ -1447,47 +1463,14 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
- // Lowering for indirect addressing
- const MachineFunction &MF = DAG.getMachineFunction();
- const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
- unsigned StackWidth = TFL->getStackWidth(MF);
-
- Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
-
- if (VT.isVector()) {
- unsigned NumElemVT = VT.getVectorNumElements();
- EVT ElemVT = VT.getVectorElementType();
- SDValue Loads[4];
-
- assert(NumElemVT <= 4);
- assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
- "vector width in load");
-
- for (unsigned i = 0; i < NumElemVT; ++i) {
- unsigned Channel, PtrIncr;
- getStackAddress(StackWidth, i, Channel, PtrIncr);
- Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
- DAG.getConstant(PtrIncr, DL, MVT::i32));
- Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
- Chain, Ptr,
- DAG.getTargetConstant(Channel, DL, MVT::i32),
- Op.getOperand(2));
- }
- EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
- LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
- } else {
- LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
- Chain, Ptr,
- DAG.getTargetConstant(0, DL, MVT::i32), // Channel
- Op.getOperand(2));
+  // The DWORDADDR ISD node marks an already-shifted address
+ if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
+ assert(VT == MVT::i32);
+ Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32));
+ Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr);
+ return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand());
}
-
- SDValue Ops[2] = {
- LoweredLoad,
- Chain
- };
-
- return DAG.getMergeValues(Ops, DL);
+ return SDValue();
}
SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 3a72e0791fd6..19795bdde647 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -1268,6 +1268,17 @@ let Predicates = [isR600] in {
defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
+// Hardcode the channel to 0
+// NOTE: LSHR is not available here. LSHR is a per-family instruction.
+def : Pat <
+ (i32 (load_private ADDRIndirect:$addr) ),
+ (R600_RegisterLoad FRAMEri:$addr, (i32 0))
+>;
+def : Pat <
+ (store_private i32:$val, ADDRIndirect:$addr),
+ (R600_RegisterStore i32:$val, FRAMEri:$addr, (i32 0))
+>;
+
//===----------------------------------------------------------------------===//
// Pseudo instructions
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index c78e97dfd46f..9140fe6cd148 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -99,6 +99,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+ setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
+ setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
+
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
@@ -699,7 +711,8 @@ SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
- unsigned Offset, bool Signed) const {
+ unsigned Offset, bool Signed,
+ const ISD::InputArg *Arg) const {
const DataLayout &DL = DAG.getDataLayout();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
@@ -713,20 +726,21 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
- SDValue Val;
+ SDValue Val = Load;
+ if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
+ VT.bitsLT(MemVT)) {
+ unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
+ Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
+ }
+
if (MemVT.isFloatingPoint())
- Val = getFPExtOrFPTrunc(DAG, Load, SL, VT);
+ Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
else if (Signed)
- Val = DAG.getSExtOrTrunc(Load, SL, VT);
+ Val = DAG.getSExtOrTrunc(Val, SL, VT);
else
- Val = DAG.getZExtOrTrunc(Load, SL, VT);
-
- SDValue Ops[] = {
- Val,
- Load.getValue(1)
- };
+ Val = DAG.getZExtOrTrunc(Val, SL, VT);
- return DAG.getMergeValues(Ops, SL);
+ return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
}
SDValue SITargetLowering::LowerFormalArguments(
@@ -899,7 +913,8 @@ SDValue SITargetLowering::LowerFormalArguments(
// The first 36 bytes of the input buffer contains information about
// thread group and global sizes.
SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
- Offset, Ins[i].Flags.isSExt());
+ Offset, Ins[i].Flags.isSExt(),
+ &Ins[i]);
Chains.push_back(Arg.getValue(1));
auto *ParamTy =
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 9583f6db6faa..6c04e4f30977 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -24,7 +24,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain,
unsigned Offset) const;
SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
- SDValue Chain, unsigned Offset, bool Signed) const;
+ SDValue Chain, unsigned Offset, bool Signed,
+ const ISD::InputArg *Arg = nullptr) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
diff --git a/lib/Target/AVR/AVRISelDAGToDAG.cpp b/lib/Target/AVR/AVRISelDAGToDAG.cpp
index 156a21dfecfe..462a7d57d2de 100644
--- a/lib/Target/AVR/AVRISelDAGToDAG.cpp
+++ b/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -203,8 +203,8 @@ unsigned AVRDAGToDAGISel::selectIndexedProgMemLoad(const LoadSDNode *LD,
bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintCode,
std::vector<SDValue> &OutOps) {
- assert(ConstraintCode == InlineAsm::Constraint_m ||
- ConstraintCode == InlineAsm::Constraint_Q &&
+ assert((ConstraintCode == InlineAsm::Constraint_m ||
+ ConstraintCode == InlineAsm::Constraint_Q) &&
"Unexpected asm memory constraint");
MachineRegisterInfo &RI = MF->getRegInfo();
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp
index 53668f05b59b..07fc3f6890b8 100644
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@@ -14,6 +14,7 @@
#include "AVRISelLowering.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -1933,5 +1934,45 @@ void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
+unsigned AVRTargetLowering::getRegisterByName(const char *RegName,
+ EVT VT,
+ SelectionDAG &DAG) const {
+ unsigned Reg;
+
+ if (VT == MVT::i8) {
+ Reg = StringSwitch<unsigned>(RegName)
+ .Case("r0", AVR::R0).Case("r1", AVR::R1).Case("r2", AVR::R2)
+ .Case("r3", AVR::R3).Case("r4", AVR::R4).Case("r5", AVR::R5)
+ .Case("r6", AVR::R6).Case("r7", AVR::R7).Case("r8", AVR::R8)
+ .Case("r9", AVR::R9).Case("r10", AVR::R10).Case("r11", AVR::R11)
+ .Case("r12", AVR::R12).Case("r13", AVR::R13).Case("r14", AVR::R14)
+ .Case("r15", AVR::R15).Case("r16", AVR::R16).Case("r17", AVR::R17)
+ .Case("r18", AVR::R18).Case("r19", AVR::R19).Case("r20", AVR::R20)
+ .Case("r21", AVR::R21).Case("r22", AVR::R22).Case("r23", AVR::R23)
+ .Case("r24", AVR::R24).Case("r25", AVR::R25).Case("r26", AVR::R26)
+ .Case("r27", AVR::R27).Case("r28", AVR::R28).Case("r29", AVR::R29)
+ .Case("r30", AVR::R30).Case("r31", AVR::R31)
+ .Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30)
+ .Default(0);
+ } else {
+ Reg = StringSwitch<unsigned>(RegName)
+ .Case("r0", AVR::R1R0).Case("r2", AVR::R3R2)
+ .Case("r4", AVR::R5R4).Case("r6", AVR::R7R6)
+ .Case("r8", AVR::R9R8).Case("r10", AVR::R11R10)
+ .Case("r12", AVR::R13R12).Case("r14", AVR::R15R14)
+ .Case("r16", AVR::R17R16).Case("r18", AVR::R19R18)
+ .Case("r20", AVR::R21R20).Case("r22", AVR::R23R22)
+ .Case("r24", AVR::R25R24).Case("r26", AVR::R27R26)
+ .Case("r28", AVR::R29R28).Case("r30", AVR::R31R30)
+ .Case("X", AVR::R27R26).Case("Y", AVR::R29R28).Case("Z", AVR::R31R30)
+ .Default(0);
+ }
+
+ if (Reg)
+ return Reg;
+
+ report_fatal_error("Invalid register name global variable");
+}
+
} // end of namespace llvm
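getRegisterByName is the hook consulted when lowering the llvm.read_register / llvm.write_register intrinsics (e.g. for named register global variables); the implementation above is essentially a StringSwitch lookup with 0 as the not-found sentinel. A minimal sketch of that idiom, with illustrative names and values:

#include "llvm/ADT/StringSwitch.h"
#include <cassert>
using namespace llvm;

static unsigned lookupReg(const char *Name) {
  return StringSwitch<unsigned>(Name)
      .Case("r0", 1)
      .Case("r1", 2)
      .Case("Z", 31)
      .Default(0); // 0 means "unknown register"
}

int main() {
  assert(lookupReg("Z") == 31);
  assert(lookupReg("bogus") == 0);
  return 0;
}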
diff --git a/lib/Target/AVR/AVRISelLowering.h b/lib/Target/AVR/AVRISelLowering.h
index 17074e1b1eee..a8cdc4e7ae23 100644
--- a/lib/Target/AVR/AVRISelLowering.h
+++ b/lib/Target/AVR/AVRISelLowering.h
@@ -116,6 +116,9 @@ public:
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
+ unsigned getRegisterByName(const char* RegName, EVT VT,
+ SelectionDAG &DAG) const override;
+
private:
SDValue getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AVRcc,
SelectionDAG &DAG, SDLoc dl) const;
diff --git a/lib/Target/BPF/BPFInstrInfo.cpp b/lib/Target/BPF/BPFInstrInfo.cpp
index cbe4466164f9..e38facead922 100644
--- a/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/lib/Target/BPF/BPFInstrInfo.cpp
@@ -13,15 +13,13 @@
#include "BPF.h"
#include "BPFInstrInfo.h"
-#include "BPFSubtarget.h"
-#include "BPFTargetMachine.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
+#include <cassert>
+#include <iterator>
#define GET_INSTRINFO_CTOR_DTOR
#include "BPFGenInstrInfo.inc"
@@ -109,11 +107,11 @@ bool BPFInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
while (std::next(I) != MBB.end())
std::next(I)->eraseFromParent();
Cond.clear();
- FBB = 0;
+ FBB = nullptr;
// Delete the J if it's equivalent to a fall-through.
if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
- TBB = 0;
+ TBB = nullptr;
I->eraseFromParent();
I = MBB.end();
continue;
diff --git a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index b0037fbc16ac..9beefcdcc1d5 100644
--- a/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -12,16 +12,15 @@
//===----------------------------------------------------------------------===//
#include "BPF.h"
-#include "BPFRegisterInfo.h"
#include "BPFSubtarget.h"
#include "MCTargetDesc/BPFMCTargetDesc.h"
-
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
+#include <cstdint>
using namespace llvm;
@@ -36,14 +35,15 @@ class BPFDisassembler : public MCDisassembler {
public:
BPFDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
: MCDisassembler(STI, Ctx) {}
- virtual ~BPFDisassembler() {}
+ ~BPFDisassembler() override = default;
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &VStream,
raw_ostream &CStream) const override;
};
-}
+
+} // end anonymous namespace
static MCDisassembler *createBPFDisassembler(const Target &T,
const MCSubtargetInfo &STI,
diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index a6cd2002c12c..afc321ea2c34 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -8,28 +8,24 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCDirectives.h"
-#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
namespace {
+
class BPFAsmBackend : public MCAsmBackend {
public:
bool IsLittleEndian;
BPFAsmBackend(bool IsLittleEndian)
: MCAsmBackend(), IsLittleEndian(IsLittleEndian) {}
- ~BPFAsmBackend() override {}
+ ~BPFAsmBackend() override = default;
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
uint64_t Value, bool IsPCRel) const override;
@@ -53,6 +49,8 @@ public:
bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
};
+} // end anonymous namespace
+
bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
if ((Count % 8) != 0)
return false;
@@ -66,7 +64,6 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
unsigned DataSize, uint64_t Value,
bool IsPCRel) const {
-
if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
assert(Value == 0);
} else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
@@ -92,7 +89,6 @@ void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
MCObjectWriter *BPFAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
return createBPFELFObjectWriter(OS, 0, IsLittleEndian);
}
-}
MCAsmBackend *llvm::createBPFAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
diff --git a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
index 3d1c0eb55afa..ebe9abd8ffac 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp
@@ -10,29 +10,30 @@
#include "MCTargetDesc/BPFMCTargetDesc.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixup.h"
+#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include <cstdint>
using namespace llvm;
namespace {
+
class BPFELFObjectWriter : public MCELFObjectTargetWriter {
public:
BPFELFObjectWriter(uint8_t OSABI);
-
- ~BPFELFObjectWriter() override;
+ ~BPFELFObjectWriter() override = default;
protected:
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup, bool IsPCRel) const override;
};
-}
+
+} // end anonymous namespace
BPFELFObjectWriter::BPFELFObjectWriter(uint8_t OSABI)
: MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_BPF,
/*HasRelocationAddend*/ false) {}
-BPFELFObjectWriter::~BPFELFObjectWriter() {}
-
unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
index 47f16512a397..e8c974479828 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -12,24 +12,25 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
-#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
#define DEBUG_TYPE "mccodeemitter"
namespace {
+
class BPFMCCodeEmitter : public MCCodeEmitter {
- BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete;
- void operator=(const BPFMCCodeEmitter &) = delete;
const MCInstrInfo &MCII;
const MCRegisterInfo &MRI;
bool IsLittleEndian;
@@ -38,8 +39,9 @@ public:
BPFMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
bool IsLittleEndian)
: MCII(mcii), MRI(mri), IsLittleEndian(IsLittleEndian) {}
-
- ~BPFMCCodeEmitter() {}
+ BPFMCCodeEmitter(const BPFMCCodeEmitter &) = delete;
+ void operator=(const BPFMCCodeEmitter &) = delete;
+ ~BPFMCCodeEmitter() override = default;
// getBinaryCodeForInstr - TableGen'erated function for getting the
// binary encoding for an instruction.
@@ -66,7 +68,8 @@ private:
void verifyInstructionPredicates(const MCInst &MI,
uint64_t AvailableFeatures) const;
};
-}
+
+} // end anonymous namespace
MCCodeEmitter *llvm::createBPFMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
diff --git a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
index 55415f97396b..b58409730de0 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -12,14 +12,13 @@
//===----------------------------------------------------------------------===//
#include "BPF.h"
-#include "BPFMCTargetDesc.h"
-#include "BPFMCAsmInfo.h"
#include "InstPrinter/BPFInstPrinter.h"
+#include "MCTargetDesc/BPFMCTargetDesc.h"
+#include "MCTargetDesc/BPFMCAsmInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
@@ -64,7 +63,7 @@ static MCInstPrinter *createBPFMCInstPrinter(const Triple &T,
const MCRegisterInfo &MRI) {
if (SyntaxVariant == 0)
return new BPFInstPrinter(MAI, MII, MRI);
- return 0;
+ return nullptr;
}
extern "C" void LLVMInitializeBPFTargetMC() {
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index 5fb5b0227800..df12e0e88e3b 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -101,7 +101,7 @@ LLVMBool LLVMTargetHasAsmBackend(LLVMTargetRef T) {
}
LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
- const char* Triple, const char* CPU, const char* Features,
+ const char *Triple, const char *CPU, const char *Features,
LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
LLVMCodeModel CodeModel) {
Optional<Reloc::Model> RM;
@@ -139,7 +139,7 @@ LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
TargetOptions opt;
return wrap(unwrap(T)->createTargetMachine(Triple, CPU, Features, opt, RM,
- CM, OL));
+ CM, OL));
}
void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { delete unwrap(T); }
diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt
index f4d46383e5bb..d9c53ecc8d08 100644
--- a/lib/Target/WebAssembly/CMakeLists.txt
+++ b/lib/Target/WebAssembly/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_target(WebAssemblyCodeGen
WebAssemblyExplicitLocals.cpp
WebAssemblyFastISel.cpp
WebAssemblyFixIrreducibleControlFlow.cpp
+ WebAssemblyFixFunctionBitcasts.cpp
WebAssemblyFrameLowering.cpp
WebAssemblyISelDAGToDAG.cpp
WebAssemblyISelLowering.cpp
diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h
index 09c35b4825fc..8738263ad847 100644
--- a/lib/Target/WebAssembly/WebAssembly.h
+++ b/lib/Target/WebAssembly/WebAssembly.h
@@ -28,6 +28,7 @@ class FunctionPass;
// LLVM IR passes.
ModulePass *createWebAssemblyLowerEmscriptenEHSjLj(bool DoEH, bool DoSjLj);
void initializeWebAssemblyLowerEmscriptenEHSjLjPass(PassRegistry &);
+ModulePass *createWebAssemblyFixFunctionBitcasts();
FunctionPass *createWebAssemblyOptimizeReturned();
// ISel and immediate followup passes.
diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
new file mode 100644
index 000000000000..d5474a02ce01
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -0,0 +1,159 @@
+//===-- WebAssemblyFixFunctionBitcasts.cpp - Fix function bitcasts --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief Fix bitcasted functions.
+///
+/// WebAssembly requires caller and callee signatures to match; in LLVM,
+/// however, some amount of slop is permitted. Detect mismatches by looking for
+/// bitcasts of functions and rewrite them to use wrapper functions instead.
+///
+/// This doesn't catch all cases, such as when a function's address is taken in
+/// one place and cast in another, but it works for many common cases.
+///
+/// Note that LLVM already optimizes away function bitcasts in common cases by
+/// dropping arguments as needed, so this pass only ends up getting used in less
+/// common cases.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-fix-function-bitcasts"
+
+namespace {
+class FixFunctionBitcasts final : public ModulePass {
+ StringRef getPassName() const override {
+ return "WebAssembly Fix Function Bitcasts";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ ModulePass::getAnalysisUsage(AU);
+ }
+
+ bool runOnModule(Module &M) override;
+
+public:
+ static char ID;
+ FixFunctionBitcasts() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char FixFunctionBitcasts::ID = 0;
+ModulePass *llvm::createWebAssemblyFixFunctionBitcasts() {
+ return new FixFunctionBitcasts();
+}
+
+// Recursively descend the def-use lists from V to find non-bitcast users of
+// bitcasts of V.
+static void FindUses(Value *V, Function &F,
+ SmallVectorImpl<std::pair<Use *, Function *>> &Uses) {
+ for (Use &U : V->uses()) {
+ if (BitCastOperator *BC = dyn_cast<BitCastOperator>(U.getUser()))
+ FindUses(BC, F, Uses);
+ else if (U.get()->getType() != F.getType())
+ Uses.push_back(std::make_pair(&U, &F));
+ }
+}
+
+// Create a wrapper function with type Ty that calls F (which may have a
+// different type). Attempt to support common bitcasted function idioms:
+// - Call with more arguments than needed: arguments are dropped
+// - Call with fewer arguments than needed: arguments are filled in with undef
+// - Return value is not needed: drop it
+// - Return value needed but not present: supply an undef
+//
+// For now, return nullptr without creating a wrapper if the wrapper cannot
+// be generated due to incompatible types.
+static Function *CreateWrapper(Function *F, FunctionType *Ty) {
+ Module *M = F->getParent();
+
+ Function *Wrapper =
+ Function::Create(Ty, Function::PrivateLinkage, "bitcast", M);
+ BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper);
+
+ // Determine what arguments to pass.
+ SmallVector<Value *, 4> Args;
+ Function::arg_iterator AI = Wrapper->arg_begin();
+ FunctionType::param_iterator PI = F->getFunctionType()->param_begin();
+ FunctionType::param_iterator PE = F->getFunctionType()->param_end();
+ for (; AI != Wrapper->arg_end() && PI != PE; ++AI, ++PI) {
+ if (AI->getType() != *PI) {
+ Wrapper->eraseFromParent();
+ return nullptr;
+ }
+ Args.push_back(&*AI);
+ }
+ for (; PI != PE; ++PI)
+ Args.push_back(UndefValue::get(*PI));
+
+ CallInst *Call = CallInst::Create(F, Args, "", BB);
+
+ // Determine what value to return.
+ if (Ty->getReturnType()->isVoidTy())
+ ReturnInst::Create(M->getContext(), BB);
+ else if (F->getFunctionType()->getReturnType()->isVoidTy())
+ ReturnInst::Create(M->getContext(), UndefValue::get(Ty->getReturnType()),
+ BB);
+ else if (F->getFunctionType()->getReturnType() == Ty->getReturnType())
+ ReturnInst::Create(M->getContext(), Call, BB);
+ else {
+ Wrapper->eraseFromParent();
+ return nullptr;
+ }
+
+ return Wrapper;
+}
+
+bool FixFunctionBitcasts::runOnModule(Module &M) {
+ SmallVector<std::pair<Use *, Function *>, 0> Uses;
+
+ // Collect all the places that need wrappers.
+ for (Function &F : M)
+ FindUses(&F, F, Uses);
+
+ DenseMap<std::pair<Function *, FunctionType *>, Function *> Wrappers;
+
+ for (auto &UseFunc : Uses) {
+ Use *U = UseFunc.first;
+ Function *F = UseFunc.second;
+ PointerType *PTy = cast<PointerType>(U->get()->getType());
+ FunctionType *Ty = dyn_cast<FunctionType>(PTy->getElementType());
+
+    // If the function is cast to something like i8* as a "generic pointer"
+    // to be later cast to something else, we can't generate a wrapper for it.
+ // Just ignore such casts for now.
+ if (!Ty)
+ continue;
+
+ auto Pair = Wrappers.insert(std::make_pair(std::make_pair(F, Ty), nullptr));
+ if (Pair.second)
+ Pair.first->second = CreateWrapper(F, Ty);
+
+ Function *Wrapper = Pair.first->second;
+ if (!Wrapper)
+ continue;
+
+ if (isa<Constant>(U->get()))
+ U->get()->replaceAllUsesWith(Wrapper);
+ else
+ U->set(Wrapper);
+ }
+
+ return true;
+}
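
For orientation, the following is a minimal sketch (an assumed example, not taken from this patch; all names are hypothetical) of the source-level idiom that gives rise to the function bitcasts handled above: an indirect call through a function pointer whose signature does not match the callee, which the pass routes through a generated wrapper.

// Hypothetical illustration of the signature-mismatched call this pass wraps.
// On WebAssembly, caller and callee signatures must match exactly, so the
// pass emits a wrapper with the casted type that drops the extra argument and
// forwards to the real callee.
extern "C" int callee();            // real signature: no parameters

int caller(int X) {
  using FnWithArg = int (*)(int);   // bitcasted signature: one parameter
  FnWithArg F = reinterpret_cast<FnWithArg>(&callee);
  return F(X);                      // deliberately the problematic idiom
}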
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
index 8a3248ee669e..e872dc219846 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInteger.td
@@ -40,8 +40,8 @@ defm ROTL : BinaryInt<rotl, "rotl", 0x77, 0x89>;
defm ROTR : BinaryInt<rotr, "rotr", 0x78, 0x8a>;
let isCommutable = 1 in {
-defm EQ : ComparisonInt<SETEQ, "eq ", 0x46, 0x68>;
-defm NE : ComparisonInt<SETNE, "ne ", 0x47, 0x69>;
+defm EQ : ComparisonInt<SETEQ, "eq ", 0x46, 0x51>;
+defm NE : ComparisonInt<SETNE, "ne ", 0x47, 0x52>;
} // isCommutable = 1
defm LT_S : ComparisonInt<SETLT, "lt_s", 0x48, 0x53>;
defm LT_U : ComparisonInt<SETULT, "lt_u", 0x49, 0x54>;
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index b61bc0a08143..f5ef35a2ad40 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -163,6 +163,10 @@ void WebAssemblyPassConfig::addIRPasses() {
// control specifically what gets lowered.
addPass(createAtomicExpandPass(TM));
+ // Fix function bitcasts, as WebAssembly requires caller and callee signatures
+ // to match.
+ addPass(createWebAssemblyFixFunctionBitcasts());
+
// Optimize "returned" function attributes.
if (getOptLevel() != CodeGenOpt::None)
addPass(createWebAssemblyOptimizeReturned());
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7f72ab17f619..db76ddf04c06 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6962,23 +6962,24 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
-/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
-/// node.
-static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
- const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+/// Returns true iff \p BV builds a vector whose result is equivalent to the
+/// result of an ADDSUB operation.
+/// If true is returned, the operands of the ADDSUB = Opnd0 +- Opnd1 operation
+/// are written to the parameters \p Opnd0 and \p Opnd1.
+static bool isAddSub(const BuildVectorSDNode *BV,
+ const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ SDValue &Opnd0, SDValue &Opnd1) {
+
MVT VT = BV->getSimpleValueType(0);
if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
- return SDValue();
+ (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
+ (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+ return false;
- SDLoc DL(BV);
unsigned NumElts = VT.getVectorNumElements();
SDValue InVec0 = DAG.getUNDEF(VT);
SDValue InVec1 = DAG.getUNDEF(VT);
- assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
- VT == MVT::v2f64) && "build_vector with an invalid type found!");
-
// Odd-numbered elements in the input build vector are obtained from
// adding two integer/float elements.
// Even-numbered elements in the input build vector are obtained from
@@ -7000,7 +7001,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
// Early exit if we found an unexpected opcode.
if (Opcode != ExpectedOpcode)
- return SDValue();
+ return false;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -7013,11 +7014,11 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
!isa<ConstantSDNode>(Op1.getOperand(1)) ||
Op0.getOperand(1) != Op1.getOperand(1))
- return SDValue();
+ return false;
unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
if (I0 != i)
- return SDValue();
+ return false;
// We found a valid add/sub node. Update the information accordingly.
if (i & 1)
@@ -7029,39 +7030,118 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
if (InVec0.isUndef()) {
InVec0 = Op0.getOperand(0);
if (InVec0.getSimpleValueType() != VT)
- return SDValue();
+ return false;
}
if (InVec1.isUndef()) {
InVec1 = Op1.getOperand(0);
if (InVec1.getSimpleValueType() != VT)
- return SDValue();
+ return false;
}
// Make sure that operands in input to each add/sub node always
// come from a same pair of vectors.
if (InVec0 != Op0.getOperand(0)) {
if (ExpectedOpcode == ISD::FSUB)
- return SDValue();
+ return false;
// FADD is commutable. Try to commute the operands
// and then test again.
std::swap(Op0, Op1);
if (InVec0 != Op0.getOperand(0))
- return SDValue();
+ return false;
}
if (InVec1 != Op1.getOperand(0))
- return SDValue();
+ return false;
// Update the pair of expected opcodes.
std::swap(ExpectedOpcode, NextExpectedOpcode);
}
// Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
- if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
- return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
+ if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
+ return false;
- return SDValue();
+ Opnd0 = InVec0;
+ Opnd1 = InVec1;
+ return true;
+}
+
+/// Returns true if it is possible to fold MUL and an idiom that has already
+/// been recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
+/// If (and only if) true is returned, the operands of FMADDSUB are written to
+/// parameters \p Opnd0, \p Opnd1, \p Opnd2.
+///
+/// Prior to calling this function it should be known that there is some
+/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
+/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
+/// before the replacement of such an SDNode with an ADDSUB operation. Thus
+/// the number of \p Opnd0 uses is expected to be equal to 2.
+/// For example, this function may be called for the following IR:
+/// %AB = fmul fast <2 x double> %A, %B
+/// %Sub = fsub fast <2 x double> %AB, %C
+/// %Add = fadd fast <2 x double> %AB, %C
+/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
+/// <2 x i32> <i32 0, i32 3>
+/// There is a def for %Addsub here, which potentially can be replaced by
+/// X86ISD::ADDSUB operation:
+/// %Addsub = X86ISD::ADDSUB %AB, %C
+/// and such ADDSUB can further be replaced with FMADDSUB:
+/// %Addsub = FMADDSUB %A, %B, %C.
+///
+/// The main reason this method is called before the replacement of the
+/// recognized ADDSUB idiom with an ADDSUB operation is that such a
+/// replacement is sometimes illegal, e.g. 512-bit ADDSUB is not available,
+/// while 512-bit FMADDSUB is.
+static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+ SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
+ if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
+ !Subtarget.hasAnyFMA())
+ return false;
+
+ // FIXME: These checks must match the similar ones in
+ // DAGCombiner::visitFADDForFMACombine. It would be good to have one
+ // function that would answer if it is Ok to fuse MUL + ADD to FMADD
+ // or MUL + ADDSUB to FMADDSUB.
+ const TargetOptions &Options = DAG.getTarget().Options;
+ bool AllowFusion =
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
+ if (!AllowFusion)
+ return false;
+
+ Opnd2 = Opnd1;
+ Opnd1 = Opnd0.getOperand(1);
+ Opnd0 = Opnd0.getOperand(0);
+
+ return true;
+}
+
+/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub'
+/// operation into an X86ISD::ADDSUB or X86ISD::FMADDSUB node accordingly.
+static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Opnd0, Opnd1;
+ if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
+ return SDValue();
+
+ MVT VT = BV->getSimpleValueType(0);
+ SDLoc DL(BV);
+
+ // Try to generate X86ISD::FMADDSUB node here.
+ SDValue Opnd2;
+ if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+ return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+
+ // Do not generate X86ISD::ADDSUB node for 512-bit types even though
+ // the ADDSUB idiom has been successfully recognized. There are no known
+ // X86 targets with 512-bit ADDSUB instructions!
+ // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
+ // recognition.
+ if (VT.is512BitVector())
+ return SDValue();
+
+ return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
@@ -7290,7 +7370,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return VectorConstant;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
- if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
+ if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
return HorizontalOp;
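
As a quick reference for what isAddSub and isFMAddSub above are recognizing, here is a scalar sketch of the lane-wise semantics of the two target nodes (an illustrative example only; the real lowering operates on DAG nodes, and FMA contraction is gated on the fast-math checks noted in isFMAddSub):

#include <cstddef>

// ADDSUB: even lanes subtract, odd lanes add (matching the build_vector
// pattern described above).
void AddSub(const float *A, const float *B, float *Out, size_t N) {
  for (size_t I = 0; I != N; ++I)
    Out[I] = (I & 1) ? A[I] + B[I] : A[I] - B[I];
}

// FMADDSUB: the same alternation applied to a multiply: A*B -/+ C.
void FMAddSub(const float *A, const float *B, const float *C, float *Out,
              size_t N) {
  for (size_t I = 0; I != N; ++I)
    Out[I] = (I & 1) ? A[I] * B[I] + C[I] : A[I] * B[I] - C[I];
}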
@@ -12965,6 +13045,12 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Subtarget.hasVBMI())
return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+  // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// FIXME: Implement direct support for this type!
return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
}
@@ -16985,9 +17071,16 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
}
- if (Cond.getOpcode() == ISD::SETCC)
- if (SDValue NewCond = LowerSETCC(Cond, DAG))
+ if (Cond.getOpcode() == ISD::SETCC) {
+ if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
+      // If the condition was updated, it's possible that the operands of the
+      // select were also updated (for example, EmitTest performs a RAUW).
+      // Refresh the local references to the select operands in case they
+      // became stale.
+ Op1 = Op.getOperand(1);
+ Op2 = Op.getOperand(2);
+ }
+ }
// (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
@@ -17193,22 +17286,26 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
return SDValue();
- if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
+ if (VT.is512BitVector() && InVTElt != MVT::i1) {
if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
}
- assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+ assert (InVTElt == MVT::i1 && "Unexpected vector type");
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
- SDValue NegOne = DAG.getConstant(
- APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
- SDValue Zero = DAG.getConstant(
- APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
+ SDValue V;
+ if (Subtarget.hasDQI()) {
+ V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In);
+ assert(!VT.is512BitVector() && "Unexpected vector type");
+ } else {
+ SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl);
+ SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
+ V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+ if (VT.is512BitVector())
+ return V;
+ }
- SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
- if (VT.is512BitVector())
- return V;
return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
}
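
Both branches above produce the same lane-wise result: each i1 mask element is sign-extended to an all-ones or all-zeros element, either directly via VSEXT (with DQI) or via a vselect between the ones and zero vectors. A one-line scalar sketch of that semantics (illustration only):

#include <cstdint>

int32_t SignExtendMaskBit(bool M) { return M ? -1 : 0; } // all-ones or all-zeros lane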
@@ -21528,6 +21625,23 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
}
+ // It's worth extending once and using the vXi16/vXi32 shifts for smaller
+  // types, but without AVX512 the extra overhead to get from vXi8 to vXi32
+ // make the existing SSE solution better.
+ if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
+ (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
+ (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
+ (Subtarget.hasBWI() && VT == MVT::v32i8)) {
+ MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
+ MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
+ unsigned ExtOpc =
+ Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ R = DAG.getNode(ExtOpc, dl, ExtVT, R);
+ Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
+ }
+
if (VT == MVT::v16i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
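
The block added above implements an extend, shift in the wider element type, then truncate strategy. A scalar sketch of the idea for a single logical-shift-right lane (an assumed illustration, not target code):

#include <cstdint>

uint8_t LshrViaWiden(uint8_t R, unsigned Amt) {
  uint32_t Wide = R;                  // zero-extend the lane (SRA would sign-extend)
  Wide >>= Amt;                       // shift in the wider type
  return static_cast<uint8_t>(Wide);  // truncate back to the narrow type
}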
@@ -21636,19 +21750,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
}
}
- // It's worth extending once and using the v8i32 shifts for 16-bit types, but
- // the extra overheads to get from v16i8 to v8i32 make the existing SSE
- // solution better.
- if (Subtarget.hasInt256() && VT == MVT::v8i16) {
- MVT ExtVT = MVT::v8i32;
- unsigned ExtOpc =
- Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
- R = DAG.getNode(ExtOpc, dl, ExtVT, R);
- Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
- return DAG.getNode(ISD::TRUNCATE, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
- }
-
if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
MVT ExtVT = MVT::v8i32;
SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
@@ -27763,29 +27864,32 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return SDValue();
}
-/// \brief Try to combine a shuffle into a target-specific add-sub node.
+/// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
+/// operation. If true is returned, the operands of the ADDSUB operation
+/// are written to the parameters \p Opnd0 and \p Opnd1.
///
-/// We combine this directly on the abstract vector shuffle nodes so it is
-/// easier to generically match. We also insert dummy vector shuffle nodes for
-/// the operands which explicitly discard the lanes which are unused by this
-/// operation to try to flow through the rest of the combiner the fact that
-/// they're unused.
-static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- SDLoc DL(N);
+/// We combine shuffles to ADDSUB directly on the abstract vector shuffle nodes
+/// so they are easier to match generically. We also insert dummy vector shuffle
+/// nodes for the operands which explicitly discard the lanes unused by this
+/// operation, so that the fact that they're unused can flow through the rest
+/// of the combiner.
+static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
+ SDValue &Opnd0, SDValue &Opnd1) {
+
EVT VT = N->getValueType(0);
if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
- (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
- return SDValue();
+ (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
+ (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
+ return false;
// We only handle target-independent shuffles.
// FIXME: It would be easy and harmless to use the target shuffle mask
// extraction tool to support more.
if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
- return SDValue();
+ return false;
ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
- SmallVector<int, 8> Mask(OrigMask.begin(), OrigMask.end());
+ SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
SDValue V1 = N->getOperand(0);
SDValue V2 = N->getOperand(1);
@@ -27796,27 +27900,57 @@ static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
ShuffleVectorSDNode::commuteMask(Mask);
std::swap(V1, V2);
} else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
- return SDValue();
+ return false;
// If there are other uses of these operations we can't fold them.
if (!V1->hasOneUse() || !V2->hasOneUse())
- return SDValue();
+ return false;
// Ensure that both operations have the same operands. Note that we can
// commute the FADD operands.
SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
(V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
- return SDValue();
+ return false;
// We're looking for blends between FADD and FSUB nodes. We insist on these
// nodes being lined up in a specific expected pattern.
if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
- isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
+ isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
+ 8, 25, 10, 27, 12, 29, 14, 31})))
+ return false;
+
+ Opnd0 = LHS;
+ Opnd1 = RHS;
+ return true;
+}
+
+/// \brief Try to combine a shuffle into a target-specific add-sub or
+/// mul-add-sub node.
+static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ SDValue Opnd0, Opnd1;
+ if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Try to generate X86ISD::FMADDSUB node here.
+ SDValue Opnd2;
+ if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+ return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
+
+ // Do not generate X86ISD::ADDSUB node for 512-bit types even though
+ // the ADDSUB idiom has been successfully recognized. There are no known
+ // X86 targets with 512-bit ADDSUB instructions!
+ if (VT.is512BitVector())
return SDValue();
- return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
+ return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
// We are looking for a shuffle where both sources are concatenated with undef
@@ -27878,7 +28012,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB node.
if (TLI.isTypeLegal(VT))
- if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
+ if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
// During Type Legalization, when promoting illegal vector types,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 908053e1342d..d44d1395f243 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -443,6 +443,22 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
[(set VR512:$dst, (v16i32 immAllOnesV))]>;
}
+// Alias instructions that allow VPTERNLOG to be used with a mask to create
+// a mix of all-ones and all-zeros elements. They are defined as pseudos to
+// force the same register to be used as the input for all three sources.
+let isPseudo = 1, Predicates = [HasAVX512] in {
+def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst),
+ (ins VK16WM:$mask), "",
+ [(set VR512:$dst, (vselect (v16i1 VK16WM:$mask),
+ (v16i32 immAllOnesV),
+ (v16i32 immAllZerosV)))]>;
+def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst),
+ (ins VK8WM:$mask), "",
+ [(set VR512:$dst, (vselect (v8i1 VK8WM:$mask),
+ (bc_v8i64 (v16i32 immAllOnesV)),
+ (bc_v8i64 (v16i32 immAllZerosV))))]>;
+}
+
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in {
def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "",
@@ -1064,10 +1080,10 @@ def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
(v8f32 VR256X:$src), 1)>;
def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))),
(VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v4f64 VR256X:$src), 1)>;
+ (v4f64 VR256X:$src), 1)>;
def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v4i64 VR256X:$src), 1)>;
+ (v4i64 VR256X:$src), 1)>;
def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v8i32 VR256X:$src), 1)>;
@@ -1485,8 +1501,7 @@ defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd",
// AVX-512 - BLEND using mask
//
multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in {
- let hasSideEffects = 0 in
+ let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
@@ -1496,16 +1511,13 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [(set _.RC:$dst, (vselect _.KRCWM:$mask,
- (_.VT _.RC:$src2),
- (_.VT _.RC:$src1)))]>, EVEX_4V, EVEX_K;
- let hasSideEffects = 0 in
+ []>, EVEX_4V, EVEX_K;
def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
[]>, EVEX_4V, EVEX_KZ;
- let mayLoad = 1, hasSideEffects = 0 in
+ let mayLoad = 1 in {
def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
@@ -1515,38 +1527,32 @@ multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- [(set _.RC:$dst, (vselect _.KRCWM:$mask,
- (_.VT (bitconvert (_.LdFrag addr:$src2))),
- (_.VT _.RC:$src1)))]>,
- EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
- let mayLoad = 1, hasSideEffects = 0 in
+ []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
[]>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>;
}
+ }
}
multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+ let mayLoad = 1, hasSideEffects = 0 in {
def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
- [(set _.RC:$dst,(vselect _.KRCWM:$mask,
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- (_.VT _.RC:$src1)))]>,
- EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+ []>, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
- let mayLoad = 1, hasSideEffects = 0 in
def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
"$dst, $src1, ${src2}", _.BroadcastStr, "}"),
[]>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
-
+ }
}
multiclass blendmask_dq <bits<8> opc, string OpcodeStr,
@@ -1582,21 +1588,6 @@ defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>;
defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W;
-let Predicates = [HasAVX512, NoVLX] in {
-def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
- (v8f32 VR256X:$src2))),
- (EXTRACT_SUBREG
- (v16f32 (VBLENDMPSZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
- (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>;
-
-def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
- (v8i32 VR256X:$src2))),
- (EXTRACT_SUBREG
- (v16i32 (VPBLENDMDZrrk (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_ymm)>;
-}
//===----------------------------------------------------------------------===//
// Compare Instructions
//===----------------------------------------------------------------------===//
@@ -2735,7 +2726,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(ins _.KRCWM:$mask, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
"${dst} {${mask}} {z}, $src}"),
- [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
+ [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
(_.VT _.RC:$src),
_.ImmAllZerosV)))], _.ExeDomain>,
EVEX, EVEX_KZ;
@@ -2972,6 +2963,30 @@ def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)),
(v16i32 VR512:$src))),
(VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>;
+// Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't
+// available. Use a 512-bit operation and extract.
+let Predicates = [HasAVX512, NoVLX] in {
+def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
+ (v8f32 VR256X:$src0))),
+ (EXTRACT_SUBREG
+ (v16f32
+ (VMOVAPSZrrk
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),
+ (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
+ sub_ymm)>;
+
+def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
+ (v8i32 VR256X:$src0))),
+ (EXTRACT_SUBREG
+ (v16i32
+ (VMOVDQA32Zrrk
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src0, sub_ymm)),
+ (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))),
+ sub_ymm)>;
+}
+
let Predicates = [HasVLX, NoBWI] in {
// 128-bit load/store without BWI.
def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
@@ -3116,13 +3131,13 @@ let Predicates = [HasVLX] in {
(VMOVDQU32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
}
-
-// Move Int Doubleword to Packed Double Int
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(set VR128X:$dst,
+
+// Move Int Doubleword to Packed Double Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
(v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
EVEX;
def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
@@ -3152,47 +3167,47 @@ def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src
def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(store (i64 (bitconvert FR64X:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
- EVEX_CD8<64, CD8VT1>;
-}
-} // ExeDomain = SSEPackedInt
-
-// Move Int Doubleword to Single Scalar
-//
-let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
-def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(set FR32X:$dst, (bitconvert GR32:$src))],
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
+ EVEX_CD8<64, CD8VT1>;
+}
+} // ExeDomain = SSEPackedInt
+
+// Move Int Doubleword to Single Scalar
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set FR32X:$dst, (bitconvert GR32:$src))],
IIC_SSE_MOVDQ>, EVEX;
def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
-} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
-
-// Move doubleword from xmm register to r/m32
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move doubleword from xmm register to r/m32
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
(iPTR 0)))], IIC_SSE_MOVD_ToGP>,
EVEX;
def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
- [(store (i32 (extractelt (v4i32 VR128X:$src),
- (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
- EVEX, EVEX_CD8<32, CD8VT1>;
-} // ExeDomain = SSEPackedInt
-
-// Move quadword from xmm1 register to r/m64
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
+ [(store (i32 (extractelt (v4i32 VR128X:$src),
+ (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+ EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt
+
+// Move quadword from xmm1 register to r/m64
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
(iPTR 0)))],
IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
Requires<[HasAVX512, In64BitMode]>;
@@ -3213,39 +3228,39 @@ def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
let hasSideEffects = 0 in
def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
- (ins VR128X:$src),
- "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
- EVEX, VEX_W;
-} // ExeDomain = SSEPackedInt
-
-// Move Scalar Single to Double Int
-//
-let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
-def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
- (ins FR32X:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
+ (ins VR128X:$src),
+ "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
+ EVEX, VEX_W;
+} // ExeDomain = SSEPackedInt
+
+// Move Scalar Single to Double Int
+//
+let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
+def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
+ (ins FR32X:$src),
+ "vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32X:$src))],
IIC_SSE_MOVD_ToGP>, EVEX;
def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, FR32X:$src),
- "vmovd\t{$src, $dst|$dst, $src}",
- [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
-} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
-
-// Move Quadword Int to Packed Quadword Int
-//
-let ExeDomain = SSEPackedInt in {
-def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
- (ins i64mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}",
- [(set VR128X:$dst,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
- EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
-} // ExeDomain = SSEPackedInt
-
-//===----------------------------------------------------------------------===//
-// AVX-512 MOVSS, MOVSD
+ "vmovd\t{$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
+
+// Move Quadword Int to Packed Quadword Int
+//
+let ExeDomain = SSEPackedInt in {
+def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
+ (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set VR128X:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
+} // ExeDomain = SSEPackedInt
+
+//===----------------------------------------------------------------------===//
+// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
multiclass avx512_move_scalar<string asm, SDNode OpNode,
@@ -8646,6 +8661,28 @@ def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
+ (v2f64 VR128X:$src0)),
+ (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
+ (bitconvert (v4i32 immAllZerosV))),
+ (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+ (v2f64 VR128X:$src0)),
+ (VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask,
+ (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+ (bitconvert (v4i32 immAllZerosV))),
+ (VMOVDDUPZ128rrkz VK2WM:$mask, (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (v2f64 VR128X:$src0)),
+ (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (bitconvert (v4i32 immAllZerosV))),
+ (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 579359794fbd..e3484d062bc8 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -543,7 +543,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::MOV8rr, X86::MOV8rm, 0 },
{ X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
- { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
{ X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
{ X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
{ X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
@@ -661,7 +661,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
{ X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
{ X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
- { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 },
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
{ X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
{ X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
{ X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
@@ -6864,6 +6864,21 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addReg(Reg, RegState::Undef).addImm(0xff);
return true;
}
+ case X86::AVX512_512_SEXT_MASK_32:
+ case X86::AVX512_512_SEXT_MASK_64: {
+ unsigned Reg = MIB->getOperand(0).getReg();
+ unsigned MaskReg = MIB->getOperand(1).getReg();
+ unsigned MaskState = getRegState(MIB->getOperand(1));
+ unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
+ X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
+ MI.RemoveOperand(1);
+ MIB->setDesc(get(Opc));
+ // VPTERNLOG needs 3 register inputs and an immediate.
+ // 0xff will return 1s for any input.
+ MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
+ .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff);
+ return true;
+ }
case X86::VMOVAPSZ128rm_NOVLX:
return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
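
To make the 0xff immediate in the AVX512_512_SEXT_MASK expansion above concrete: VPTERNLOG evaluates a three-input truth table per bit, with the immediate acting as the table. A small sketch of that per-bit evaluation (illustration only):

#include <cstdint>

// The three source bits index into the 8-bit immediate. Imm == 0xff maps
// every combination to 1, so each unmasked lane becomes all-ones; with
// zero-masking ({z}), masked-off lanes stay all-zeros, which together
// sign-extends the k-mask into vector elements.
bool TernlogBit(uint8_t Imm, bool A, bool B, bool C) {
  unsigned Idx = (A ? 4u : 0u) | (B ? 2u : 0u) | (C ? 1u : 0u);
  return (Imm >> Idx) & 1u;
}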
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 4cd6ae563f03..09971d586a41 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -6397,7 +6397,7 @@ let Predicates = [HasAVX] in {
defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround",
int_x86_sse41_round_ss,
int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
- defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
+ defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
}
let Predicates = [UseAVX] in {
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index de4839432b9a..107ed9359376 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -144,6 +144,10 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry AVX512BWUniformConstCostTable[] = {
+ { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
+
{ ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
{ ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
};
@@ -168,6 +172,10 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry AVX2UniformConstCostTable[] = {
+ { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
+
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
@@ -184,6 +192,14 @@ int X86TTIImpl::getArithmeticInstrCost(
}
static const CostTblEntry SSE2UniformConstCostTable[] = {
+ { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
+ { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
+ { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
+
+ { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand).
+ { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand).
+ { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb).
+
{ ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
{ ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
{ ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
@@ -207,6 +223,43 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
}
+ static const CostTblEntry AVX2UniformCostTable[] = {
+ // Uniform splats are cheaper for the following instructions.
+ { ISD::SHL, MVT::v16i16, 1 }, // psllw.
+ { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
+ { ISD::SRA, MVT::v16i16, 1 }, // psraw.
+ };
+
+ if (ST->hasAVX2() &&
+ ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
+ (Op2Info == TargetTransformInfo::OK_UniformValue))) {
+ if (const auto *Entry =
+ CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
+ static const CostTblEntry SSE2UniformCostTable[] = {
+ // Uniform splats are cheaper for the following instructions.
+ { ISD::SHL, MVT::v8i16, 1 }, // psllw.
+ { ISD::SHL, MVT::v4i32, 1 }, // pslld
+ { ISD::SHL, MVT::v2i64, 1 }, // psllq.
+
+ { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
+ { ISD::SRL, MVT::v4i32, 1 }, // psrld.
+ { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
+
+ { ISD::SRA, MVT::v8i16, 1 }, // psraw.
+ { ISD::SRA, MVT::v4i32, 1 }, // psrad.
+ };
+
+ if (ST->hasSSE2() &&
+ ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
+ (Op2Info == TargetTransformInfo::OK_UniformValue))) {
+ if (const auto *Entry =
+ CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+ }
+
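
For context on how these cost tables are consumed (a worked example under the assumption of a plain SSE2 target): lookups are done on the legalized type, and the per-register entry cost is scaled by the number of registers the original type splits into, i.e. LT.first * Entry->Cost.

// Worked example: a uniform (splat) SHL of v16i16 on SSE2 only. v16i16
// legalizes to two v8i16 registers, and the table above prices one v8i16
// shift at 1 (psllw), so the reported cost is 2 * 1 == 2.
int ScaledCost(int NumLegalParts, int PerPartCost) {
  return NumLegalParts * PerPartCost; // mirrors LT.first * Entry->Cost
}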
static const CostTblEntry AVX512DQCostTable[] = {
{ ISD::MUL, MVT::v2i64, 1 },
{ ISD::MUL, MVT::v4i64, 1 },
@@ -219,6 +272,10 @@ int X86TTIImpl::getArithmeticInstrCost(
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWCostTable[] = {
+ { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
+
{ ISD::MUL, MVT::v64i8, 11 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v32i8, 4 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 4 }, // extend/pmullw/trunc sequence.
@@ -259,7 +316,7 @@ int X86TTIImpl::getArithmeticInstrCost(
if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- static const CostTblEntry AVX2CostTable[] = {
+ static const CostTblEntry AVX2ShiftCostTable[] = {
// Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
// customize them to detect the cases where shift amount is a scalar one.
{ ISD::SHL, MVT::v4i32, 1 },
@@ -283,11 +340,11 @@ int X86TTIImpl::getArithmeticInstrCost(
// is lowered into a vector multiply (vpmullw).
return LT.first;
- if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+ if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
- static const CostTblEntry XOPCostTable[] = {
+ static const CostTblEntry XOPShiftCostTable[] = {
// 128bit shifts take 1cy, but right shifts require negation beforehand.
{ ISD::SHL, MVT::v16i8, 1 },
{ ISD::SRL, MVT::v16i8, 2 },
@@ -318,93 +375,20 @@ int X86TTIImpl::getArithmeticInstrCost(
// Look for XOP lowering tricks.
if (ST->hasXOP())
- if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second))
+ if (const auto *Entry = CostTableLookup(XOPShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- static const CostTblEntry AVX2CustomCostTable[] = {
- { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
- { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
-
- { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
- { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
-
- { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
- { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
- { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
- { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
-
- { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v8i32, 1 }, // pmulld
- { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
-
- { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
- { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
- { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
- { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
- { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
- { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
- };
-
- // Look for AVX2 lowering tricks for custom cases.
- if (ST->hasAVX2())
- if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD,
- LT.second))
- return LT.first * Entry->Cost;
-
- static const CostTblEntry AVXCustomCostTable[] = {
- { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
-
- { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
- { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
-
- // Vectorizing division is a bad idea. See the SSE2 table for more comments.
- { ISD::SDIV, MVT::v32i8, 32*20 },
- { ISD::SDIV, MVT::v16i16, 16*20 },
- { ISD::SDIV, MVT::v8i32, 8*20 },
- { ISD::SDIV, MVT::v4i64, 4*20 },
- { ISD::UDIV, MVT::v32i8, 32*20 },
- { ISD::UDIV, MVT::v16i16, 16*20 },
- { ISD::UDIV, MVT::v8i32, 8*20 },
- { ISD::UDIV, MVT::v4i64, 4*20 },
- };
-
- // Look for AVX2 lowering tricks for custom cases.
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVXCustomCostTable, ISD,
- LT.second))
- return LT.first * Entry->Cost;
-
- static const CostTblEntry
- SSE2UniformCostTable[] = {
+ static const CostTblEntry SSE2UniformShiftCostTable[] = {
// Uniform splats are cheaper for the following instructions.
- { ISD::SHL, MVT::v16i8, 1 }, // psllw.
- { ISD::SHL, MVT::v32i8, 2 }, // psllw.
- { ISD::SHL, MVT::v8i16, 1 }, // psllw.
{ ISD::SHL, MVT::v16i16, 2 }, // psllw.
- { ISD::SHL, MVT::v4i32, 1 }, // pslld
{ ISD::SHL, MVT::v8i32, 2 }, // pslld
- { ISD::SHL, MVT::v2i64, 1 }, // psllq.
{ ISD::SHL, MVT::v4i64, 2 }, // psllq.
- { ISD::SRL, MVT::v16i8, 1 }, // psrlw.
- { ISD::SRL, MVT::v32i8, 2 }, // psrlw.
- { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
{ ISD::SRL, MVT::v16i16, 2 }, // psrlw.
- { ISD::SRL, MVT::v4i32, 1 }, // psrld.
{ ISD::SRL, MVT::v8i32, 2 }, // psrld.
- { ISD::SRL, MVT::v2i64, 1 }, // psrlq.
{ ISD::SRL, MVT::v4i64, 2 }, // psrlq.
- { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
- { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb.
- { ISD::SRA, MVT::v8i16, 1 }, // psraw.
{ ISD::SRA, MVT::v16i16, 2 }, // psraw.
- { ISD::SRA, MVT::v4i32, 1 }, // psrad.
{ ISD::SRA, MVT::v8i32, 2 }, // psrad.
{ ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle.
{ ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle.
@@ -414,7 +398,7 @@ int X86TTIImpl::getArithmeticInstrCost(
((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
(Op2Info == TargetTransformInfo::OK_UniformValue))) {
if (const auto *Entry =
- CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
+ CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
@@ -422,24 +406,98 @@ int X86TTIImpl::getArithmeticInstrCost(
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
MVT VT = LT.second;
// Vector shift left by non uniform constant can be lowered
- // into vector multiply (pmullw/pmulld).
- if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
- (VT == MVT::v4i32 && ST->hasSSE41()))
- return LT.first;
-
- // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
- // sequence of extract + two vector multiply + insert.
- if ((VT == MVT::v8i32 || VT == MVT::v16i16) &&
- (ST->hasAVX() && !ST->hasAVX2()))
- ISD = ISD::MUL;
-
- // A vector shift left by non uniform constant is converted
- // into a vector multiply; the new multiply is eventually
- // lowered into a sequence of shuffles and 2 x pmuludq.
- if (VT == MVT::v4i32 && ST->hasSSE2())
+ // into vector multiply.
+ if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
+ ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
ISD = ISD::MUL;
}
+ static const CostTblEntry AVX2CostTable[] = {
+ { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+
+ { ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+
+ { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
+ { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
+
+ { ISD::SUB, MVT::v32i8, 1 }, // psubb
+ { ISD::ADD, MVT::v32i8, 1 }, // paddb
+ { ISD::SUB, MVT::v16i16, 1 }, // psubw
+ { ISD::ADD, MVT::v16i16, 1 }, // paddw
+ { ISD::SUB, MVT::v8i32, 1 }, // psubd
+ { ISD::ADD, MVT::v8i32, 1 }, // paddd
+ { ISD::SUB, MVT::v4i64, 1 }, // psubq
+ { ISD::ADD, MVT::v4i64, 1 }, // paddq
+
+ { ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v16i16, 1 }, // pmullw
+ { ISD::MUL, MVT::v8i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
+
+ { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
+ };
+
+ // Look for AVX2 lowering tricks for custom cases.
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
+ static const CostTblEntry AVX1CostTable[] = {
+ // We don't have to scalarize unsupported ops. We can issue two half-sized
+ // operations and we only need to extract the upper YMM half.
+ // Two ops + 1 extract + 1 insert = 4.
+ { ISD::MUL, MVT::v16i16, 4 },
+ { ISD::MUL, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v32i8, 4 },
+ { ISD::ADD, MVT::v32i8, 4 },
+ { ISD::SUB, MVT::v16i16, 4 },
+ { ISD::ADD, MVT::v16i16, 4 },
+ { ISD::SUB, MVT::v8i32, 4 },
+ { ISD::ADD, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v4i64, 4 },
+ { ISD::ADD, MVT::v4i64, 4 },
+
+ // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
+  // are lowered as a series of long multiplies (3), shifts (3) and adds (2).
+ // Because we believe v4i64 to be a legal type, we must also include the
+ // extract+insert in the cost table. Therefore, the cost here is 18
+ // instead of 8.
+ { ISD::MUL, MVT::v4i64, 18 },
+
+ { ISD::MUL, MVT::v32i8, 26 }, // extend/pmullw/trunc sequence.
+
+ { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
+ { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
+
+ // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+ { ISD::SDIV, MVT::v32i8, 32*20 },
+ { ISD::SDIV, MVT::v16i16, 16*20 },
+ { ISD::SDIV, MVT::v8i32, 8*20 },
+ { ISD::SDIV, MVT::v4i64, 4*20 },
+ { ISD::UDIV, MVT::v32i8, 32*20 },
+ { ISD::UDIV, MVT::v16i16, 16*20 },
+ { ISD::UDIV, MVT::v8i32, 8*20 },
+ { ISD::UDIV, MVT::v4i64, 4*20 },
+ };
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry SSE42CostTable[] = {
{ ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
@@ -456,6 +514,8 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence.
{ ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence.
{ ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence.
+ { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
+ { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld
{ ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence.
{ ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence.
@@ -501,6 +561,7 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence.
{ ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence.
+ { ISD::MUL, MVT::v8i16, 1 }, // pmullw
{ ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
{ ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
@@ -516,46 +577,19 @@ int X86TTIImpl::getArithmeticInstrCost(
// generally a bad idea. Assume somewhat arbitrarily that we have to be able
// to hide "20 cycles" for each lane.
{ ISD::SDIV, MVT::v16i8, 16*20 },
- { ISD::SDIV, MVT::v8i16, 8*20 },
- { ISD::SDIV, MVT::v4i32, 4*20 },
- { ISD::SDIV, MVT::v2i64, 2*20 },
+ { ISD::SDIV, MVT::v8i16, 8*20 },
+ { ISD::SDIV, MVT::v4i32, 4*20 },
+ { ISD::SDIV, MVT::v2i64, 2*20 },
{ ISD::UDIV, MVT::v16i8, 16*20 },
- { ISD::UDIV, MVT::v8i16, 8*20 },
- { ISD::UDIV, MVT::v4i32, 4*20 },
- { ISD::UDIV, MVT::v2i64, 2*20 },
+ { ISD::UDIV, MVT::v8i16, 8*20 },
+ { ISD::UDIV, MVT::v4i32, 4*20 },
+ { ISD::UDIV, MVT::v2i64, 2*20 },
};
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
- static const CostTblEntry AVX1CostTable[] = {
- // We don't have to scalarize unsupported ops. We can issue two half-sized
- // operations and we only need to extract the upper YMM half.
- // Two ops + 1 extract + 1 insert = 4.
- { ISD::MUL, MVT::v16i16, 4 },
- { ISD::MUL, MVT::v8i32, 4 },
- { ISD::SUB, MVT::v32i8, 4 },
- { ISD::ADD, MVT::v32i8, 4 },
- { ISD::SUB, MVT::v16i16, 4 },
- { ISD::ADD, MVT::v16i16, 4 },
- { ISD::SUB, MVT::v8i32, 4 },
- { ISD::ADD, MVT::v8i32, 4 },
- { ISD::SUB, MVT::v4i64, 4 },
- { ISD::ADD, MVT::v4i64, 4 },
- // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
- // are lowered as a series of long multiplies(3), shifts(3) and adds(2)
- // Because we believe v4i64 to be a legal type, we must also include the
- // extract+insert in the cost table. Therefore, the cost here is 18
- // instead of 8.
- { ISD::MUL, MVT::v4i64, 18 },
- };
-
- // Look for AVX1 lowering tricks.
- if (ST->hasAVX() && !ST->hasAVX2())
- if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
- return LT.first * Entry->Cost;
-
static const CostTblEntry SSE1CostTable[] = {
{ ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
@@ -639,8 +673,7 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw
{ TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw
- { TTI::SK_Reverse, MVT::v64i8, 6 }, // vextracti64x4 + 2*vperm2i128
- // + 2*pshufb + vinserti64x4
+ { TTI::SK_Reverse, MVT::v64i8, 2 }, // pshufb + vshufi64x2
{ TTI::SK_PermuteSingleSrc, MVT::v32i16, 1 }, // vpermw
{ TTI::SK_PermuteSingleSrc, MVT::v16i16, 1 }, // vpermw
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index f4742aaf748f..82daf754be0d 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -42,6 +42,8 @@
using namespace llvm;
using namespace lowertypetests;
+using SummaryAction = LowerTypeTestsSummaryAction;
+
#define DEBUG_TYPE "lowertypetests"
STATISTIC(ByteArraySizeBits, "Byte array size in bits");
@@ -55,9 +57,15 @@ static cl::opt<bool> AvoidReuse(
cl::desc("Try to avoid reuse of byte array addresses using aliases"),
cl::Hidden, cl::init(true));
-static cl::opt<std::string> ClSummaryAction(
+static cl::opt<SummaryAction> ClSummaryAction(
"lowertypetests-summary-action",
- cl::desc("What to do with the summary when running this pass"), cl::Hidden);
+ cl::desc("What to do with the summary when running this pass"),
+ cl::values(clEnumValN(SummaryAction::None, "none", "Do nothing"),
+ clEnumValN(SummaryAction::Import, "import",
+ "Import typeid resolutions from summary and globals"),
+ clEnumValN(SummaryAction::Export, "export",
+ "Export typeid resolutions to summary and globals")),
+ cl::Hidden);
static cl::opt<std::string> ClReadSummary(
"lowertypetests-read-summary",
@@ -226,8 +234,8 @@ public:
class LowerTypeTestsModule {
Module &M;
- // This is for testing purposes only.
- std::unique_ptr<ModuleSummaryIndex> OwnedSummary;
+ SummaryAction Action;
+ ModuleSummaryIndex *Summary;
bool LinkerSubsectionsViaSymbols;
Triple::ArchType Arch;
@@ -319,21 +327,38 @@ class LowerTypeTestsModule {
void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions);
public:
- LowerTypeTestsModule(Module &M);
- ~LowerTypeTestsModule();
+ LowerTypeTestsModule(Module &M, SummaryAction Action,
+ ModuleSummaryIndex *Summary);
bool lower();
+
+ // Lower the module using the action and summary passed as command line
+ // arguments. For testing purposes only.
+ static bool runForTesting(Module &M);
};
struct LowerTypeTests : public ModulePass {
static char ID;
- LowerTypeTests() : ModulePass(ID) {
+
+ bool UseCommandLine = false;
+
+ SummaryAction Action;
+ ModuleSummaryIndex *Summary;
+
+ LowerTypeTests() : ModulePass(ID), UseCommandLine(true) {
+ initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
+ }
+
+ LowerTypeTests(SummaryAction Action, ModuleSummaryIndex *Summary)
+ : ModulePass(ID), Action(Action), Summary(Summary) {
initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
}
bool runOnModule(Module &M) override {
if (skipModule(M))
return false;
- return LowerTypeTestsModule(M).lower();
+ if (UseCommandLine)
+ return LowerTypeTestsModule::runForTesting(M);
+ return LowerTypeTestsModule(M, Action, Summary).lower();
}
};
@@ -343,7 +368,10 @@ INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
false)
char LowerTypeTests::ID = 0;
-ModulePass *llvm::createLowerTypeTestsPass() { return new LowerTypeTests; }
+ModulePass *llvm::createLowerTypeTestsPass(SummaryAction Action,
+ ModuleSummaryIndex *Summary) {
+ return new LowerTypeTests(Action, Summary);
+}
/// Build a bit set for TypeId using the object layouts in
/// GlobalLayout.
@@ -1145,22 +1173,12 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
}
/// Lower all type tests in this module.
-LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) {
- // Handle the command-line summary arguments. This code is for testing
- // purposes only, so we handle errors directly.
- if (!ClSummaryAction.empty()) {
- OwnedSummary = make_unique<ModuleSummaryIndex>();
- if (!ClReadSummary.empty()) {
- ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary +
- ": ");
- auto ReadSummaryFile =
- ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
-
- yaml::Input In(ReadSummaryFile->getBuffer());
- In >> *OwnedSummary;
- ExitOnErr(errorCodeToError(In.error()));
- }
- }
+LowerTypeTestsModule::LowerTypeTestsModule(Module &M, SummaryAction Action,
+ ModuleSummaryIndex *Summary)
+ : M(M), Action(Action), Summary(Summary) {
+ // FIXME: Use these fields.
+ (void)this->Action;
+ (void)this->Summary;
Triple TargetTriple(M.getTargetTriple());
LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX();
@@ -1169,18 +1187,36 @@ LowerTypeTestsModule::LowerTypeTestsModule(Module &M) : M(M) {
ObjectFormat = TargetTriple.getObjectFormat();
}
-LowerTypeTestsModule::~LowerTypeTestsModule() {
- if (ClSummaryAction.empty() || ClWriteSummary.empty())
- return;
+bool LowerTypeTestsModule::runForTesting(Module &M) {
+ ModuleSummaryIndex Summary;
- ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
- ": ");
- std::error_code EC;
- raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text);
- ExitOnErr(errorCodeToError(EC));
+ // Handle the command-line summary arguments. This code is for testing
+ // purposes only, so we handle errors directly.
+ if (!ClReadSummary.empty()) {
+ ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary +
+ ": ");
+ auto ReadSummaryFile =
+ ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+
+ yaml::Input In(ReadSummaryFile->getBuffer());
+ In >> Summary;
+ ExitOnErr(errorCodeToError(In.error()));
+ }
+
+ bool Changed = LowerTypeTestsModule(M, ClSummaryAction, &Summary).lower();
+
+ if (!ClWriteSummary.empty()) {
+ ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
+ ": ");
+ std::error_code EC;
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text);
+ ExitOnErr(errorCodeToError(EC));
+
+ yaml::Output Out(OS);
+ Out << Summary;
+ }
- yaml::Output Out(OS);
- Out << *OwnedSummary;
+ return Changed;
}
bool LowerTypeTestsModule::lower() {
@@ -1313,7 +1349,8 @@ bool LowerTypeTestsModule::lower() {
PreservedAnalyses LowerTypeTestsPass::run(Module &M,
ModuleAnalysisManager &AM) {
- bool Changed = LowerTypeTestsModule(M).lower();
+ bool Changed =
+ LowerTypeTestsModule(M, SummaryAction::None, /*Summary=*/nullptr).lower();
if (!Changed)
return PreservedAnalyses::all();
return PreservedAnalyses::none();
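
For reference, a minimal sketch of how a client other than PassManagerBuilder might schedule the pass with the new two-argument factory. The createLowerTypeTestsPass signature and the LowerTypeTestsSummaryAction enum come from this patch; the header choices, the function name exportTypeIds, and the legacy pass-manager driver around it are assumptions for illustration only.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Transforms/IPO.h"

// Run LowerTypeTests in export mode against a combined summary index.
void exportTypeIds(llvm::Module &M, llvm::ModuleSummaryIndex &CombinedIndex) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createLowerTypeTestsPass(llvm::LowerTypeTestsSummaryAction::Export,
                                        &CombinedIndex));
  PM.run(M);
}

The command-line path stays separate: the parameterless LowerTypeTests constructor sets UseCommandLine and dispatches to runForTesting, which is driven by -lowertypetests-summary-action together with -lowertypetests-read-summary and -lowertypetests-write-summary.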
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 293ddf21a68f..d086ee05a64f 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -857,7 +857,8 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
// Lower type metadata and the type.test intrinsic. This pass supports Clang's
// control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
// link time if CFI is enabled. The pass does nothing if CFI is disabled.
- PM.add(createLowerTypeTestsPass());
+ PM.add(createLowerTypeTestsPass(LowerTypeTestsSummaryAction::None,
+ /*Summary=*/nullptr));
if (OptLevel != 0)
addLateLTOOptimizationPasses(PM);
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 012bfc7b4944..013159cde774 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1903,7 +1903,7 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
return foldICmpShlOne(Cmp, Shl, C);
// Check that the shift amount is in range. If not, don't perform undefined
- // shifts. When the shift is visited it will be simplified.
+ // shifts. When the shift is visited, it will be simplified.
unsigned TypeBits = C->getBitWidth();
if (ShiftAmt->uge(TypeBits))
return nullptr;
@@ -1923,7 +1923,7 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
return new ICmpInst(Pred, X, LShrC);
if (Shl->hasOneUse()) {
- // Otherwise strength reduce the shift into an and.
+ // Otherwise, strength reduce the shift into an and.
Constant *Mask = ConstantInt::get(Shl->getType(),
APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
@@ -1951,7 +1951,7 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
}
// When the shift is nuw and pred is >u or <=u, comparison only really happens
- // in the pre-shifted bits. Since InstSimplify canoncalizes <=u into <u, the
+ // in the pre-shifted bits. Since InstSimplify canonicalizes <=u into <u, the
// <=u case can be further converted to match <u (see below).
if (Shl->hasNoUnsignedWrap() &&
(Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT)) {
@@ -1970,9 +1970,9 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
// Transform (icmp pred iM (shl iM %v, N), C)
// -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (C>>N))
// Transform the shl to a trunc if (trunc (C>>N)) has no loss and M-N.
- // This enables us to get rid of the shift in favor of a trunc which can be
+ // This enables us to get rid of the shift in favor of a trunc that may be
// free on the target. It has the additional benefit of comparing to a
- // smaller constant, which will be target friendly.
+ // smaller constant that may be more target-friendly.
unsigned Amt = ShiftAmt->getLimitedValue(TypeBits - 1);
if (Shl->hasOneUse() && Amt != 0 && C->countTrailingZeros() >= Amt &&
DL.isLegalInteger(TypeBits - Amt)) {
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 1d5528398776..54bdc9e0772b 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1818,6 +1818,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M) {
RegisteredFlag = new GlobalVariable(
M, IntptrTy, false, GlobalVariable::CommonLinkage,
ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
+ RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
// Update llvm.compiler.used, adding the new liveness globals. This is
// needed so that during LTO these variables stay alive. The alternative
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 6aeb5237ffe3..68faa886060a 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -1423,7 +1423,7 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
if (widenLoopCompare(DU))
return nullptr;
- // This user does not evaluate to a recurence after widening, so don't
+ // This user does not evaluate to a recurrence after widening, so don't
// follow it. Instead insert a Trunc to kill off the original use,
// eventually isolating the original narrow IV so it can be removed.
truncateIVUse(DU, DT, LI);
diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 08e7acdaaf72..8fb580183e30 100644
--- a/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -415,7 +415,9 @@ public:
Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
PH->getTerminator());
Value *Initial =
- new LoadInst(InitialPtr, "load_initial", PH->getTerminator());
+ new LoadInst(InitialPtr, "load_initial", /* isVolatile */ false,
+ Cand.Load->getAlignment(), PH->getTerminator());
+
PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
&L->getHeader()->front());
PHI->addIncoming(Initial, PH);
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 6f7682c96cef..76fe91884c7b 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -1382,8 +1382,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(),
Succ->begin(), Succ->end());
LPM->deleteSimpleAnalysisValue(BI, L);
- BI->eraseFromParent();
RemoveFromWorklist(BI, Worklist);
+ BI->eraseFromParent();
// Remove Succ from the loop tree.
LI->removeBlock(Succ);
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 8b8236390bf4..eef7db08cd46 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -79,7 +79,8 @@ STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
STATISTIC(NumGVNBlocksDeleted, "Number of blocks deleted");
STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified");
STATISTIC(NumGVNPhisAllSame, "Number of PHIs whos arguments are all the same");
-STATISTIC(NumGVNMaxIterations, "Maximum Number of iterations it took to converge GVN");
+STATISTIC(NumGVNMaxIterations,
+ "Maximum Number of iterations it took to converge GVN");
//===----------------------------------------------------------------------===//
// GVN Pass
@@ -327,7 +328,7 @@ private:
// Elimination.
struct ValueDFS;
void convertDenseToDFSOrdered(CongruenceClass::MemberSet &,
- std::vector<ValueDFS> &);
+ SmallVectorImpl<ValueDFS> &);
bool eliminateInstructions(Function &);
void replaceInstruction(Instruction *, Value *);
@@ -336,8 +337,11 @@ private:
// New instruction creation.
void handleNewInstruction(Instruction *){};
+
+ // Various instruction touch utilities
void markUsersTouched(Value *);
void markMemoryUsersTouched(MemoryAccess *);
+ void markLeaderChangeTouched(CongruenceClass *CC);
// Utilities.
void cleanupTables();
@@ -390,10 +394,10 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_END(NewGVN, "newgvn", "Global Value Numbering", false, false)
PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
- BasicBlock *PhiBlock = I->getParent();
+ BasicBlock *PHIBlock = I->getParent();
auto *PN = cast<PHINode>(I);
- auto *E = new (ExpressionAllocator)
- PHIExpression(PN->getNumOperands(), I->getParent());
+ auto *E =
+ new (ExpressionAllocator) PHIExpression(PN->getNumOperands(), PHIBlock);
E->allocateOperands(ArgRecycler, ExpressionAllocator);
E->setType(I->getType());
@@ -408,10 +412,10 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
[&](const Use &U) -> Value * {
- // Don't try to transform self-defined phis
+ // Don't try to transform self-defined phis.
if (U == PN)
return PN;
- const BasicBlockEdge BBE(PN->getIncomingBlock(U), PhiBlock);
+ const BasicBlockEdge BBE(PN->getIncomingBlock(U), PHIBlock);
return lookupOperandLeader(U, I, BBE);
});
return E;
@@ -710,6 +714,15 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
return E;
}
+// Utility function to check whether the congruence class has a member other
+// than the given instruction.
+bool hasMemberOtherThanUs(const CongruenceClass *CC, Instruction *I) {
+ // Either it has more than one member, in which case it must contain something
+ // other than us (because it's indexed by value), or it has exactly one member
+ // right now and that member is not us.
+ return CC->Members.size() > 1 || CC->Members.count(I) == 0;
+}
+
const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
const BasicBlock *B) {
// Unlike loads, we never try to eliminate stores, so we do not check if they
@@ -725,8 +738,12 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
cast<MemoryDef>(StoreAccess)->getDefiningAccess());
const Expression *OldStore = createStoreExpression(SI, StoreRHS, B);
CongruenceClass *CC = ExpressionToClass.lookup(OldStore);
+ // Basically, check if the congruence class the store is in is defined by a
+ // store that isn't us, and has the same value. MemorySSA takes care of
+ // ensuring the store has the same memory state as us already.
if (CC && CC->DefiningExpr && isa<StoreExpression>(CC->DefiningExpr) &&
- CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B))
+ CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B) &&
+ hasMemberOtherThanUs(CC, I))
return createStoreExpression(SI, StoreRHS, B);
}
@@ -810,36 +827,50 @@ bool NewGVN::setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To) {
const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
const BasicBlock *B) {
auto *E = cast<PHIExpression>(createPHIExpression(I));
- if (E->op_empty()) {
+ // We match the semantics of SimplifyPhiNode from InstructionSimplify here.
+
+ // See if all arguments are the same.
+ // We track if any were undef because they need special handling.
+ bool HasUndef = false;
+ auto Filtered = make_filter_range(E->operands(), [&](const Value *Arg) {
+ if (Arg == I)
+ return false;
+ if (isa<UndefValue>(Arg)) {
+ HasUndef = true;
+ return false;
+ }
+ return true;
+ });
+ // If we are left with no operands, it's undef.
+ if (Filtered.begin() == Filtered.end()) {
DEBUG(dbgs() << "Simplified PHI node " << *I << " to undef"
<< "\n");
E->deallocateOperands(ArgRecycler);
ExpressionAllocator.Deallocate(E);
return createConstantExpression(UndefValue::get(I->getType()));
}
-
- Value *AllSameValue = E->getOperand(0);
-
- // See if all arguments are the same, ignoring undef arguments, because we can
- // choose a value that is the same for them.
- for (const Value *Arg : E->operands())
- if (Arg != AllSameValue && !isa<UndefValue>(Arg)) {
- AllSameValue = nullptr;
- break;
+ Value *AllSameValue = *(Filtered.begin());
+ ++Filtered.begin();
+ // Can't use std::equal here, sadly, because filter.begin moves.
+ if (llvm::all_of(Filtered, [AllSameValue](const Value *V) {
+ return V == AllSameValue;
+ })) {
+ // In LLVM's non-standard representation of phi nodes, it's possible to have
+ // phi nodes with cycles (i.e., phis that depend on other phis that are in
+ // turn dependent on the original phi node), especially in weird CFGs where
+ // some arguments are unreachable or uninitialized along certain paths. This
+ // can cause infinite loops during evaluation. We work around this by not
+ // trying to really evaluate them independently, but instead using a variable
+ // expression to say whether one is equivalent to the other.
+ // We also special-case undef, so that if we have an undef, we can't use the
+ // common value unless it dominates the phi block.
+ if (HasUndef) {
+ // Only have to check for instructions.
+ if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
+ if (!DT->dominates(AllSameInst, I))
+ return E;
}
- if (AllSameValue) {
- // It's possible to have phi nodes with cycles (IE dependent on
- // other phis that are .... dependent on the original phi node),
- // especially in weird CFG's where some arguments are unreachable, or
- // uninitialized along certain paths.
- // This can cause infinite loops during evaluation (even if you disable
- // the recursion below, you will simply ping-pong between congruence
- // classes). If a phi node symbolically evaluates to another phi node,
- // just leave it alone. If they are really the same, we will still
- // eliminate them in favor of each other.
- if (isa<PHINode>(AllSameValue))
- return E;
NumGVNPhisAllSame++;
DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
<< "\n");
@@ -1007,12 +1038,22 @@ void NewGVN::markMemoryUsersTouched(MemoryAccess *MA) {
}
}
+// Touch the instructions that need to be updated after a congruence class has a
+// leader change, and mark changed values.
+void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : CC->Members) {
+ if (auto *I = dyn_cast<Instruction>(M))
+ TouchedInstructions.set(InstrDFS[I]);
+ ChangedValues.insert(M);
+ }
+}
+
// Perform congruence finding on a given value numbering expression.
void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
-
ValueToExpression[V] = E;
// This is guaranteed to return something, since it will at least find
// INITIAL.
+
CongruenceClass *VClass = ValueToClass[V];
assert(VClass && "Should have found a vclass");
// Dead classes should have been eliminated from the mapping.
@@ -1031,14 +1072,17 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
place->second = NewClass;
// Constants and variables should always be made the leader.
- if (const auto *CE = dyn_cast<ConstantExpression>(E))
+ if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
NewClass->RepLeader = CE->getConstantValue();
- else if (const auto *VE = dyn_cast<VariableExpression>(E))
- NewClass->RepLeader = VE->getVariableValue();
- else if (const auto *SE = dyn_cast<StoreExpression>(E))
- NewClass->RepLeader = SE->getStoreInst()->getValueOperand();
- else
+ } else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
+ StoreInst *SI = SE->getStoreInst();
+ NewClass->RepLeader =
+ lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
+ } else {
NewClass->RepLeader = V;
+ }
+ assert(!isa<VariableExpression>(E) &&
+ "VariableExpression should have been handled already");
EClass = NewClass;
DEBUG(dbgs() << "Created new congruence class for " << *V
@@ -1077,14 +1121,11 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
ExpressionToClass.erase(VClass->DefiningExpr);
}
} else if (VClass->RepLeader == V) {
- // FIXME: When the leader changes, the value numbering of
- // everything may change, so we need to reprocess.
+ // When the leader changes, the value numbering of
+ // everything may change due to symbolization changes, so we need to
+ // reprocess.
VClass->RepLeader = *(VClass->Members.begin());
- for (auto M : VClass->Members) {
- if (auto *I = dyn_cast<Instruction>(M))
- TouchedInstructions.set(InstrDFS[I]);
- ChangedValues.insert(M);
- }
+ markLeaderChangeTouched(VClass);
}
}
@@ -1106,6 +1147,27 @@ void NewGVN::performCongruenceFinding(Value *V, const Expression *E) {
markMemoryUsersTouched(MA);
}
}
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(V)) {
+ // There is, sadly, one complicating thing for stores. Stores do not
+ // produce values, only consume them. However, in order to make loads and
+ // stores value number the same, we ignore the value operand of the store.
+ // But the value operand will still be the leader of our class, and thus, it
+ // may change. Because the store is a use, the store will get reprocessed,
+ // but nothing will change about it, and so nothing above will catch it
+ // (since the class will not change). In order to make sure everything ends
+ // up okay, we need to recheck the leader of the class. Since stores of
+ // different values value number differently due to different MemoryDefs, we
+ // are guaranteed the leader is always the same between stores in the same
+ // class.
+ DEBUG(dbgs() << "Checking store leader\n");
+ auto ProperLeader =
+ lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
+ if (EClass->RepLeader != ProperLeader) {
+ DEBUG(dbgs() << "Store leader changed, fixing\n");
+ EClass->RepLeader = ProperLeader;
+ markLeaderChangeTouched(EClass);
+ markMemoryUsersTouched(MSSA->getMemoryAccess(SI));
+ }
}
}
@@ -1708,8 +1770,9 @@ struct NewGVN::ValueDFS {
}
};
-void NewGVN::convertDenseToDFSOrdered(CongruenceClass::MemberSet &Dense,
- std::vector<ValueDFS> &DFSOrderedSet) {
+void NewGVN::convertDenseToDFSOrdered(
+ CongruenceClass::MemberSet &Dense,
+ SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
for (auto D : Dense) {
// First add the value.
BasicBlock *BB = getBlockForValue(D);
@@ -1972,21 +2035,25 @@ bool NewGVN::eliminateInstructions(Function &F) {
ValueDFSStack EliminationStack;
// Convert the members to DFS ordered sets and then merge them.
- std::vector<ValueDFS> DFSOrderedSet;
+ SmallVector<ValueDFS, 8> DFSOrderedSet;
convertDenseToDFSOrdered(CC->Members, DFSOrderedSet);
// Sort the whole thing.
- sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
-
- for (auto &C : DFSOrderedSet) {
- int MemberDFSIn = C.DFSIn;
- int MemberDFSOut = C.DFSOut;
- Value *Member = C.Val;
- Use *MemberUse = C.U;
-
- // We ignore void things because we can't get a value from them.
- if (Member && Member->getType()->isVoidTy())
- continue;
+ std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
+
+ for (auto &VD : DFSOrderedSet) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Value *Member = VD.Val;
+ Use *MemberUse = VD.U;
+
+ if (Member) {
+ // We ignore void things because we can't get a value from them.
+ // FIXME: We could actually use this to kill dead stores that are
+ // dominated by equivalent earlier stores.
+ if (Member->getType()->isVoidTy())
+ continue;
+ }
if (EliminationStack.empty()) {
DEBUG(dbgs() << "Elimination Stack is empty\n");
@@ -1995,8 +2062,6 @@ bool NewGVN::eliminateInstructions(Function &F) {
<< EliminationStack.dfs_back().first << ","
<< EliminationStack.dfs_back().second << ")\n");
}
- if (Member && isa<Constant>(Member))
- assert(isa<Constant>(CC->RepLeader));
DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
<< MemberDFSOut << ")\n");
@@ -2037,11 +2102,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
continue;
Value *Result = EliminationStack.back();
- // Don't replace our existing users with ourselves, and don't replace
- // phi node arguments with the result of the same phi node.
- // IE tmp = phi(tmp11, undef); tmp11 = foo -> tmp = phi(tmp, undef)
- if (MemberUse->get() == Result ||
- (isa<PHINode>(Result) && MemberUse->getUser() == Result))
+ // Don't replace our existing users with ourselves.
+ if (MemberUse->get() == Result)
continue;
DEBUG(dbgs() << "Found replacement " << *Result << " for "
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 8a6be97d08c7..34be90692481 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -511,9 +511,6 @@ private:
void visitSelectInst(SelectInst &I);
void visitBinaryOperator(Instruction &I);
void visitCmpInst(CmpInst &I);
- void visitExtractElementInst(ExtractElementInst &I);
- void visitInsertElementInst(InsertElementInst &I);
- void visitShuffleVectorInst(ShuffleVectorInst &I);
void visitExtractValueInst(ExtractValueInst &EVI);
void visitInsertValueInst(InsertValueInst &IVI);
void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); }
@@ -970,21 +967,6 @@ void SCCPSolver::visitCmpInst(CmpInst &I) {
markOverdefined(&I);
}
-void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) {
- // TODO : SCCP does not handle vectors properly.
- return markOverdefined(&I);
-}
-
-void SCCPSolver::visitInsertElementInst(InsertElementInst &I) {
- // TODO : SCCP does not handle vectors properly.
- return markOverdefined(&I);
-}
-
-void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) {
- // TODO : SCCP does not handle vectors properly.
- return markOverdefined(&I);
-}
-
// Handle getelementptr instructions. If all operands are constants then we
// can turn this into a getelementptr ConstantExpr.
//
diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp
index 678d02e05d42..9844190ef84a 100644
--- a/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -67,12 +67,15 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal(
return true;
}
- // When exporting, consult the index.
- auto Summaries = ImportIndex.findGlobalValueSummaryList(SGV->getGUID());
- assert(Summaries != ImportIndex.end() &&
- "Missing summary for global value when exporting");
- assert(Summaries->second.size() == 1 && "Local has more than one summary");
- auto Linkage = Summaries->second.front()->linkage();
+ // When exporting, consult the index. We can have more than one local
+ // with the same GUID, in the case of same-named locals in different but
+ // same-named source files that were compiled in their respective directories
+ // (so the source file name and resulting GUID are the same). Find the one
+ // in this module.
+ auto Summary = ImportIndex.findSummaryInModule(
+ SGV->getGUID(), SGV->getParent()->getModuleIdentifier());
+ assert(Summary && "Missing summary for global value when exporting");
+ auto Linkage = Summary->linkage();
if (!GlobalValue::isLocalLinkage(Linkage)) {
assert(!isNonRenamableLocal(*SGV) &&
"Attempting to promote non-renamable local");
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index c8f030f7eb83..11d54bcf4f89 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1189,19 +1189,11 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
Value *LibCallSimplifier::optimizeFabs(CallInst *CI, IRBuilder<> &B) {
Function *Callee = CI->getCalledFunction();
- Value *Ret = nullptr;
StringRef Name = Callee->getName();
if (Name == "fabs" && hasFloatVersion(Name))
- Ret = optimizeUnaryDoubleFP(CI, B, false);
+ return optimizeUnaryDoubleFP(CI, B, false);
- Value *Op = CI->getArgOperand(0);
- if (Instruction *I = dyn_cast<Instruction>(Op)) {
- // Fold fabs(x * x) -> x * x; any squared FP value must already be positive.
- if (I->getOpcode() == Instruction::FMul)
- if (I->getOperand(0) == I->getOperand(1))
- return Op;
- }
- return Ret;
+ return nullptr;
}
Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 31daba2248aa..578c65daf7c0 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -783,6 +783,10 @@ protected:
// Similarly, we create a new latch condition when setting up the structure
// of the new loop, so the old one can become dead.
SmallPtrSet<Instruction *, 4> DeadInstructions;
+
+ // Holds the end values for each induction variable. We save the end values
+ // so we can later fix-up the external users of the induction variables.
+ DenseMap<PHINode *, Value *> IVEndValues;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -1879,13 +1883,6 @@ public:
unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
unsigned LoopCost);
- /// \return The most profitable unroll factor.
- /// This method finds the best unroll-factor based on register pressure and
- /// other parameters. VF and LoopCost are the selected vectorization factor
- /// and the cost of the selected VF.
- unsigned computeInterleaveCount(bool OptForSize, unsigned VF,
- unsigned LoopCost);
-
/// \brief A struct that represents some properties of the register usage
/// of a loop.
struct RegisterUsage {
@@ -3424,7 +3421,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
// Create phi nodes to merge from the backedge-taken check block.
PHINode *BCResumeVal = PHINode::Create(
OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
- Value *EndValue;
+ Value *&EndValue = IVEndValues[OrigPhi];
if (OrigPhi == OldInduction) {
// We know what the end value is.
EndValue = CountRoundDown;
@@ -3443,9 +3440,6 @@ void InnerLoopVectorizer::createEmptyLoop() {
// or the value at the end of the vectorized loop.
BCResumeVal->addIncoming(EndValue, MiddleBlock);
- // Fix up external users of the induction variable.
- fixupIVUsers(OrigPhi, II, CountRoundDown, EndValue, MiddleBlock);
-
// Fix the scalar body counter (PHI node).
unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
@@ -4116,11 +4110,23 @@ void InnerLoopVectorizer::vectorizeLoop() {
Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
} // end of for each Phi in PHIsToFix.
- fixLCSSAPHIs();
-
- // Make sure DomTree is updated.
+ // Update the dominator tree.
+ //
+ // FIXME: After creating the structure of the new loop, the dominator tree is
+ // no longer up-to-date, and it remains that way until we update it
+ // here. An out-of-date dominator tree is problematic for SCEV,
+ // because SCEVExpander uses it to guide code generation. The
+ // vectorizer uses SCEVExpander in several places. Instead, we should
+ // keep the dominator tree up-to-date as we go.
updateAnalysis();
+ // Fix up external users of the induction variables.
+ for (auto &Entry : *Legal->getInductionVars())
+ fixupIVUsers(Entry.first, Entry.second,
+ getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
+ IVEndValues[Entry.first], LoopMiddleBlock);
+
+ fixLCSSAPHIs();
predicateInstructions();
// Remove redundant induction instructions.
diff --git a/test/Analysis/CostModel/X86/shuffle-reverse.ll b/test/Analysis/CostModel/X86/shuffle-reverse.ll
index a1bdda0690aa..627d79857434 100644
--- a/test/Analysis/CostModel/X86/shuffle-reverse.ll
+++ b/test/Analysis/CostModel/X86/shuffle-reverse.ll
@@ -161,7 +161,7 @@ define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512)
; AVX1: cost of 8 {{.*}} %V512 = shufflevector
; AVX2: cost of 4 {{.*}} %V512 = shufflevector
; AVX512F: cost of 4 {{.*}} %V512 = shufflevector
- ; AVX512BW: cost of 6 {{.*}} %V512 = shufflevector
+ ; AVX512BW: cost of 2 {{.*}} %V512 = shufflevector
%V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret void
diff --git a/test/Analysis/CostModel/X86/testshiftlshr.ll b/test/Analysis/CostModel/X86/testshiftlshr.ll
index 52f176fe4d63..e5fff9b5e4da 100644
--- a/test/Analysis/CostModel/X86/testshiftlshr.ll
+++ b/test/Analysis/CostModel/X86/testshiftlshr.ll
@@ -498,7 +498,7 @@ entry:
define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
entry:
; SSE2: shift16i8c
- ; SSE2: cost of 1 {{.*}} lshr
+ ; SSE2: cost of 2 {{.*}} lshr
; SSE2-CODEGEN: shift16i8c
; SSE2-CODEGEN: psrlw $3
@@ -513,7 +513,7 @@ entry:
define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
entry:
; SSE2: shift32i8c
- ; SSE2: cost of 2 {{.*}} lshr
+ ; SSE2: cost of 4 {{.*}} lshr
; SSE2-CODEGEN: shift32i8c
; SSE2-CODEGEN: psrlw $3
diff --git a/test/Analysis/CostModel/X86/testshiftshl.ll b/test/Analysis/CostModel/X86/testshiftshl.ll
index e385c5bfeeac..6628b9b87986 100644
--- a/test/Analysis/CostModel/X86/testshiftshl.ll
+++ b/test/Analysis/CostModel/X86/testshiftshl.ll
@@ -498,7 +498,7 @@ entry:
define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
entry:
; SSE2: shift16i8c
- ; SSE2: cost of 1 {{.*}} shl
+ ; SSE2: cost of 2 {{.*}} shl
; SSE2-CODEGEN: shift16i8c
; SSE2-CODEGEN: psllw $3
@@ -513,7 +513,7 @@ entry:
define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
entry:
; SSE2: shift32i8c
- ; SSE2: cost of 2 {{.*}} shl
+ ; SSE2: cost of 4 {{.*}} shl
; SSE2-CODEGEN: shift32i8c
; SSE2-CODEGEN: psllw $3
diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
index 888164df75f5..6756f3ba2802 100644
--- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
@@ -120,7 +120,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX: Found an estimated cost of 56 for instruction: %shift
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = ashr <32 x i16> %a, %b
ret <32 x i16> %shift
@@ -282,7 +282,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX: Found an estimated cost of 56 for instruction: %shift
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
%shift = ashr <32 x i16> %a, %splat
@@ -439,7 +439,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
; AVX: Found an estimated cost of 56 for instruction: %shift
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = ashr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <32 x i16> %shift
@@ -529,8 +529,7 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) {
; AVX: Found an estimated cost of 1 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
-; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
+; XOP: Found an estimated cost of 1 for instruction: %shift
%shift = ashr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
ret <4 x i32> %shift
}
@@ -568,7 +567,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) {
; AVX: Found an estimated cost of 1 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 1 for instruction: %shift
%shift = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <8 x i16> %shift
}
@@ -578,9 +577,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
; AVX: Found an estimated cost of 2 for instruction: %shift
-; AVX2: Found an estimated cost of 10 for instruction: %shift
-; AVX512: Found an estimated cost of 10 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
}
@@ -590,10 +590,11 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
; AVX: Found an estimated cost of 4 for instruction: %shift
-; AVX2: Found an estimated cost of 20 for instruction: %shift
-; AVX512F: Found an estimated cost of 20 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; AVX2: Found an estimated cost of 2 for instruction: %shift
+; AVX512F: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 1 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = ashr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <32 x i16> %shift
}
@@ -605,7 +606,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 4 for instruction: %shift
; AVX512: Found an estimated cost of 4 for instruction: %shift
-; XOP: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 4 for instruction: %shift
%shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
}
@@ -615,9 +616,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
; SSE2: Found an estimated cost of 8 for instruction: %shift
; SSE41: Found an estimated cost of 8 for instruction: %shift
; AVX: Found an estimated cost of 8 for instruction: %shift
-; AVX2: Found an estimated cost of 24 for instruction: %shift
-; AVX512: Found an estimated cost of 24 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; AVX2: Found an estimated cost of 4 for instruction: %shift
+; AVX512: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 4 for instruction: %shift
%shift = ashr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
}
@@ -627,10 +629,11 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
; SSE2: Found an estimated cost of 16 for instruction: %shift
; SSE41: Found an estimated cost of 16 for instruction: %shift
; AVX: Found an estimated cost of 16 for instruction: %shift
-; AVX2: Found an estimated cost of 48 for instruction: %shift
-; AVX512F: Found an estimated cost of 48 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; AVX2: Found an estimated cost of 8 for instruction: %shift
+; AVX512F: Found an estimated cost of 8 for instruction: %shift
+; AVX512BW: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 16 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 8 for instruction: %shift
%shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <64 x i8> %shift
}
diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
index b3382253739f..63e6db194d52 100644
--- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll
@@ -123,7 +123,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX: Found an estimated cost of 56 for instruction: %shift
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = lshr <32 x i16> %a, %b
ret <32 x i16> %shift
@@ -287,7 +287,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX: Found an estimated cost of 56 for instruction: %shift
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
%shift = lshr <32 x i16> %a, %splat
@@ -447,7 +447,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
; AVX: Found an estimated cost of 56 for instruction: %shift
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 8 for instruction: %shift
%shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
ret <32 x i16> %shift
@@ -501,8 +501,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) {
; AVX: Found an estimated cost of 1 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
-; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
+; XOP: Found an estimated cost of 1 for instruction: %shift
%shift = lshr <2 x i64> %a, <i64 7, i64 7>
ret <2 x i64> %shift
}
@@ -540,8 +539,7 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) {
; AVX: Found an estimated cost of 1 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOPAVX: Found an estimated cost of 2 for instruction: %shift
-; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
+; XOP: Found an estimated cost of 1 for instruction: %shift
%shift = lshr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
ret <4 x i32> %shift
}
@@ -579,7 +577,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) {
; AVX: Found an estimated cost of 1 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 1 for instruction: %shift
%shift = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <8 x i16> %shift
}
@@ -589,9 +587,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
; SSE2: Found an estimated cost of 2 for instruction: %shift
; SSE41: Found an estimated cost of 2 for instruction: %shift
; AVX: Found an estimated cost of 2 for instruction: %shift
-; AVX2: Found an estimated cost of 10 for instruction: %shift
-; AVX512: Found an estimated cost of 10 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; AVX2: Found an estimated cost of 1 for instruction: %shift
+; AVX512: Found an estimated cost of 1 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 1 for instruction: %shift
%shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <16 x i16> %shift
}
@@ -601,21 +600,22 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
; SSE2: Found an estimated cost of 4 for instruction: %shift
; SSE41: Found an estimated cost of 4 for instruction: %shift
; AVX: Found an estimated cost of 4 for instruction: %shift
-; AVX2: Found an estimated cost of 20 for instruction: %shift
-; AVX512F: Found an estimated cost of 20 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; AVX2: Found an estimated cost of 2 for instruction: %shift
+; AVX512F: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 1 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
ret <32 x i16> %shift
}
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8':
-; SSE2: Found an estimated cost of 1 for instruction: %shift
-; SSE41: Found an estimated cost of 1 for instruction: %shift
-; AVX: Found an estimated cost of 1 for instruction: %shift
-; AVX2: Found an estimated cost of 1 for instruction: %shift
-; AVX512: Found an estimated cost of 1 for instruction: %shift
+; SSE2: Found an estimated cost of 2 for instruction: %shift
+; SSE41: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX2: Found an estimated cost of 2 for instruction: %shift
+; AVX512: Found an estimated cost of 2 for instruction: %shift
; XOP: Found an estimated cost of 2 for instruction: %shift
%shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
@@ -623,25 +623,27 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8':
-; SSE2: Found an estimated cost of 2 for instruction: %shift
-; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
-; AVX2: Found an estimated cost of 11 for instruction: %shift
-; AVX512: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; SSE2: Found an estimated cost of 4 for instruction: %shift
+; SSE41: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX2: Found an estimated cost of 2 for instruction: %shift
+; AVX512: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
}
define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8':
-; SSE2: Found an estimated cost of 4 for instruction: %shift
-; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
-; AVX2: Found an estimated cost of 22 for instruction: %shift
-; AVX512F: Found an estimated cost of 22 for instruction: %shift
+; SSE2: Found an estimated cost of 8 for instruction: %shift
+; SSE41: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
+; AVX2: Found an estimated cost of 4 for instruction: %shift
+; AVX512F: Found an estimated cost of 4 for instruction: %shift
; AVX512BW: Found an estimated cost of 2 for instruction: %shift
-; XOP: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 4 for instruction: %shift
%shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <64 x i8> %shift
}
diff --git a/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/test/Analysis/CostModel/X86/vshift-shl-cost.ll
index 804c5a76c319..8c42bd66c707 100644
--- a/test/Analysis/CostModel/X86/vshift-shl-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-shl-cost.ll
@@ -57,8 +57,8 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) {
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i32':
; SSE2: Found an estimated cost of 10 for instruction: %shift
-; SSE41: Found an estimated cost of 10 for instruction: %shift
-; AVX: Found an estimated cost of 10 for instruction: %shift
+; SSE41: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 1 for instruction: %shift
@@ -70,8 +70,8 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32':
; SSE2: Found an estimated cost of 20 for instruction: %shift
-; SSE41: Found an estimated cost of 20 for instruction: %shift
-; AVX: Found an estimated cost of 20 for instruction: %shift
+; SSE41: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 2 for instruction: %shift
@@ -83,8 +83,8 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32':
; SSE2: Found an estimated cost of 40 for instruction: %shift
-; SSE41: Found an estimated cost of 40 for instruction: %shift
-; AVX: Found an estimated cost of 40 for instruction: %shift
+; SSE41: Found an estimated cost of 16 for instruction: %shift
+; AVX: Found an estimated cost of 16 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 4 for instruction: %shift
@@ -124,7 +124,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX: Found an estimated cost of 56 for instruction: %shift
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 4 for instruction: %shift
%shift = shl <32 x i16> %a, %b
ret <32 x i16> %shift
@@ -216,8 +216,8 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) {
define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i32':
; SSE2: Found an estimated cost of 10 for instruction: %shift
-; SSE41: Found an estimated cost of 10 for instruction: %shift
-; AVX: Found an estimated cost of 10 for instruction: %shift
+; SSE41: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 1 for instruction: %shift
@@ -230,8 +230,8 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32':
; SSE2: Found an estimated cost of 20 for instruction: %shift
-; SSE41: Found an estimated cost of 20 for instruction: %shift
-; AVX: Found an estimated cost of 20 for instruction: %shift
+; SSE41: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 2 for instruction: %shift
@@ -244,8 +244,8 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) {
; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32':
; SSE2: Found an estimated cost of 40 for instruction: %shift
-; SSE41: Found an estimated cost of 40 for instruction: %shift
-; AVX: Found an estimated cost of 40 for instruction: %shift
+; SSE41: Found an estimated cost of 16 for instruction: %shift
+; AVX: Found an estimated cost of 16 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 4 for instruction: %shift
@@ -288,7 +288,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX: Found an estimated cost of 56 for instruction: %shift
; AVX2: Found an estimated cost of 20 for instruction: %shift
; AVX512F: Found an estimated cost of 20 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 1 for instruction: %shift
; XOP: Found an estimated cost of 4 for instruction: %shift
%splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
%shift = shl <32 x i16> %a, %splat
@@ -449,7 +449,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512F: Found an estimated cost of 2 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
@@ -607,7 +607,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512F: Found an estimated cost of 2 for instruction: %shift
-; AVX512BW: Found an estimated cost of 2 for instruction: %shift
+; AVX512BW: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 4 for instruction: %shift
; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = shl <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -616,37 +616,39 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i8':
-; SSE2: Found an estimated cost of 1 for instruction: %shift
-; SSE41: Found an estimated cost of 1 for instruction: %shift
-; AVX: Found an estimated cost of 1 for instruction: %shift
-; AVX2: Found an estimated cost of 1 for instruction: %shift
-; AVX512: Found an estimated cost of 1 for instruction: %shift
-; XOP: Found an estimated cost of 1 for instruction: %shift
+; SSE2: Found an estimated cost of 2 for instruction: %shift
+; SSE41: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 2 for instruction: %shift
+; AVX2: Found an estimated cost of 2 for instruction: %shift
+; AVX512: Found an estimated cost of 2 for instruction: %shift
+; XOP: Found an estimated cost of 2 for instruction: %shift
%shift = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <16 x i8> %shift
}
define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8':
-; SSE2: Found an estimated cost of 2 for instruction: %shift
-; SSE41: Found an estimated cost of 2 for instruction: %shift
-; AVX: Found an estimated cost of 2 for instruction: %shift
-; AVX2: Found an estimated cost of 11 for instruction: %shift
-; AVX512: Found an estimated cost of 11 for instruction: %shift
-; XOP: Found an estimated cost of 2 for instruction: %shift
+; SSE2: Found an estimated cost of 4 for instruction: %shift
+; SSE41: Found an estimated cost of 4 for instruction: %shift
+; AVX: Found an estimated cost of 4 for instruction: %shift
+; AVX2: Found an estimated cost of 2 for instruction: %shift
+; AVX512: Found an estimated cost of 2 for instruction: %shift
+; XOPAVX: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 2 for instruction: %shift
%shift = shl <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <32 x i8> %shift
}
define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8':
-; SSE2: Found an estimated cost of 4 for instruction: %shift
-; SSE41: Found an estimated cost of 4 for instruction: %shift
-; AVX: Found an estimated cost of 4 for instruction: %shift
-; AVX2: Found an estimated cost of 22 for instruction: %shift
-; AVX512F: Found an estimated cost of 22 for instruction: %shift
+; SSE2: Found an estimated cost of 8 for instruction: %shift
+; SSE41: Found an estimated cost of 8 for instruction: %shift
+; AVX: Found an estimated cost of 8 for instruction: %shift
+; AVX2: Found an estimated cost of 4 for instruction: %shift
+; AVX512F: Found an estimated cost of 4 for instruction: %shift
; AVX512BW: Found an estimated cost of 2 for instruction: %shift
-; XOP: Found an estimated cost of 4 for instruction: %shift
+; XOPAVX: Found an estimated cost of 8 for instruction: %shift
+; XOPAVX2: Found an estimated cost of 4 for instruction: %shift
%shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
ret <64 x i8> %shift
}
diff --git a/test/Analysis/ScalarEvolution/invalidation.ll b/test/Analysis/ScalarEvolution/invalidation.ll
new file mode 100644
index 000000000000..1fcaddb525e6
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/invalidation.ll
@@ -0,0 +1,70 @@
+; Test that SCEV gets invalidated when one of its dependencies is invalidated.
+;
+; Each of the RUNs checks that the pass manager runs SCEV, then invalidates it
+; due to a dependency being invalidated, and then re-runs it. This will
+; directly fail and indicate a failure that would occur later if we didn't
+; invalidate SCEV in this way.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; RUN: opt < %s -passes='require<scalar-evolution>,invalidate<assumptions>,print<scalar-evolution>' \
+; RUN: -debug-pass-manager -disable-output 2>&1 \
+; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-AC-INVALIDATE
+;
+; CHECK-AC-INVALIDATE: Running pass: RequireAnalysisPass
+; CHECK-AC-INVALIDATE: Running analysis: ScalarEvolutionAnalysis
+; CHECK-AC-INVALIDATE: Running analysis: AssumptionAnalysis
+; CHECK-AC-INVALIDATE: Running pass: InvalidateAnalysisPass
+; CHECK-AC-INVALIDATE: Invalidating analysis: AssumptionAnalysis
+; CHECK-AC-INVALIDATE: Running pass: ScalarEvolutionPrinterPass
+; CHECK-AC-INVALIDATE: Running analysis: ScalarEvolutionAnalysis
+; CHECK-AC-INVALIDATE: Running analysis: AssumptionAnalysis
+
+; RUN: opt < %s -passes='require<scalar-evolution>,invalidate<domtree>,print<scalar-evolution>' \
+; RUN: -debug-pass-manager -disable-output 2>&1 \
+; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-DT-INVALIDATE
+;
+; CHECK-DT-INVALIDATE: Running pass: RequireAnalysisPass
+; CHECK-DT-INVALIDATE: Running analysis: ScalarEvolutionAnalysis
+; CHECK-DT-INVALIDATE: Running analysis: DominatorTreeAnalysis
+; CHECK-DT-INVALIDATE: Running pass: InvalidateAnalysisPass
+; CHECK-DT-INVALIDATE: Invalidating analysis: DominatorTreeAnalysis
+; CHECK-DT-INVALIDATE: Running pass: ScalarEvolutionPrinterPass
+; CHECK-DT-INVALIDATE: Running analysis: ScalarEvolutionAnalysis
+; CHECK-DT-INVALIDATE: Running analysis: DominatorTreeAnalysis
+
+; RUN: opt < %s -passes='require<scalar-evolution>,invalidate<loops>,print<scalar-evolution>' \
+; RUN: -debug-pass-manager -disable-output 2>&1 \
+; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LI-INVALIDATE
+;
+; CHECK-LI-INVALIDATE: Running pass: RequireAnalysisPass
+; CHECK-LI-INVALIDATE: Running analysis: ScalarEvolutionAnalysis
+; CHECK-LI-INVALIDATE: Running analysis: LoopAnalysis
+; CHECK-LI-INVALIDATE: Running pass: InvalidateAnalysisPass
+; CHECK-LI-INVALIDATE: Invalidating analysis: LoopAnalysis
+; CHECK-LI-INVALIDATE: Running pass: ScalarEvolutionPrinterPass
+; CHECK-LI-INVALIDATE: Running analysis: ScalarEvolutionAnalysis
+; CHECK-LI-INVALIDATE: Running analysis: LoopAnalysis
+
+; This test isn't particularly interesting; it's just enough to make sure we
+; actually do some work inside of SCEV, so that if we regress here while the
+; debug pass printing continues to match, ASan and other tools can catch it.
+define void @test(i32 %n) {
+; CHECK-LABEL: Classifying expressions for: @test
+; CHECK: Loop %loop: backedge-taken count is 14
+; CHECK: Loop %loop: max backedge-taken count is 14
+; CHECK: Loop %loop: Predicated backedge-taken count is 14
+
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.inc, %loop ]
+ %iv.inc = add nsw i32 %iv, 3
+ %becond = icmp ne i32 %iv.inc, 46
+ br i1 %becond, label %loop, label %leave
+
+leave:
+ ret void
+}
diff --git a/test/Analysis/ValueTracking/assume.ll b/test/Analysis/ValueTracking/assume.ll
index 4bffe8ef7909..fe0ee53eb416 100644
--- a/test/Analysis/ValueTracking/assume.ll
+++ b/test/Analysis/ValueTracking/assume.ll
@@ -1,14 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -instcombine -S | FileCheck %s
define i32 @assume_add(i32 %a, i32 %b) {
; CHECK-LABEL: @assume_add(
- %1 = add i32 %a, %b
- %last_two_digits = and i32 %1, 3
- %2 = icmp eq i32 %last_two_digits, 0
- call void @llvm.assume(i1 %2)
- %3 = add i32 %1, 3
-; CHECK: %3 = or i32 %1, 3
- ret i32 %3
+; CHECK-NEXT: [[T1:%.*]] = add i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[LAST_TWO_DIGITS:%.*]] = and i32 [[T1]], 3
+; CHECK-NEXT: [[T2:%.*]] = icmp eq i32 [[LAST_TWO_DIGITS]], 0
+; CHECK-NEXT: call void @llvm.assume(i1 [[T2]])
+; CHECK-NEXT: [[T3:%.*]] = or i32 [[T1]], 3
+; CHECK-NEXT: ret i32 [[T3]]
+;
+ %t1 = add i32 %a, %b
+ %last_two_digits = and i32 %t1, 3
+ %t2 = icmp eq i32 %last_two_digits, 0
+ call void @llvm.assume(i1 %t2)
+ %t3 = add i32 %t1, 3
+ ret i32 %t3
}
declare void @llvm.assume(i1)
+
diff --git a/test/Bindings/Go/lit.local.cfg b/test/Bindings/Go/lit.local.cfg
index d68d867fb308..a587f88f54aa 100644
--- a/test/Bindings/Go/lit.local.cfg
+++ b/test/Bindings/Go/lit.local.cfg
@@ -6,7 +6,7 @@ import sys
if not 'go' in config.root.llvm_bindings:
config.unsupported = True
-if config.root.include_go_tests != 'ON':
+if not config.root.include_go_tests:
config.unsupported = True
def find_executable(executable, path=None):
diff --git a/test/Bindings/OCaml/lit.local.cfg b/test/Bindings/OCaml/lit.local.cfg
index 7a83ca142808..fd9e1c50e990 100644
--- a/test/Bindings/OCaml/lit.local.cfg
+++ b/test/Bindings/OCaml/lit.local.cfg
@@ -3,5 +3,5 @@ config.suffixes = ['.ml']
if not 'ocaml' in config.root.llvm_bindings:
config.unsupported = True
-if config.root.have_ocaml_ounit not in ('1', 'TRUE'):
+if not config.root.have_ocaml_ounit:
config.unsupported = True
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 635197bc9ddd..c1667049f80f 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,6 +1,14 @@
-if(LLVM_BUILD_EXAMPLES)
- set(ENABLE_EXAMPLES 1)
-endif()
+llvm_canonicalize_cmake_booleans(
+ LLVM_TOOL_LTO_BUILD
+ HAVE_OCAMLOPT
+ HAVE_OCAML_OUNIT
+ LLVM_INCLUDE_GO_TESTS
+ LLVM_USE_INTEL_JITEVENTS
+ HAVE_LIBZ
+ HAVE_LIBXAR
+ LLVM_ENABLE_DIA_SDK
+ LLVM_ENABLE_FFI
+ BUILD_SHARED_LIBS)
configure_lit_site_cfg(
${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll
index 628d285141bc..eb79767e62be 100644
--- a/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -137,8 +137,8 @@ define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x
; v2i16 is naturally 4 byte aligned
; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
-; TODO: This should use DST, but for some there are redundant MOVs
-; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
+; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
+; EG: 16
; EG: 16
define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(2)* %in
@@ -153,11 +153,11 @@ define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
; GCN-DAG: s_sext_i32_i16
; v2i16 is naturally 4 byte aligned
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XY, {{T[0-9].[XYZW]}},
; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
-; TODO: These should use DST, but for some there are redundant MOVs
-; TODO: We should also use ASHR instead of LSHR + BFE
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, [[DST]], 0.0, literal
+; TODO: We should use ASHR instead of LSHR + BFE
+; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
@@ -167,16 +167,23 @@ define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
ret void
}
-; FUNC-LABEL: {{^}}constant_constant_zextload_v3i16_to_v3i32:
+; FUNC-LABEL: {{^}}constant_zextload_v3i16_to_v3i32:
; GCN: s_load_dwordx2
; v3i16 is naturally 8 byte aligned
-; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1
-; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}},
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}},
+; EG: CF_END
+; EG-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1
+; EG-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1
; TODO: This should use DST, but for some there are redundant MOVs
-; EG: LSHR {{[* ]*}}{{T[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
-; EG: 16
-define void @constant_constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+; EG-DAG: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
+; EG-DAG: 16
+; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal
+; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal
+; EG-DAG: 65535
+; EG-DAG: 65535
+define void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
%ext = zext <3 x i16> %ld to <3 x i32>
@@ -184,19 +191,20 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}constant_constant_sextload_v3i16_to_v3i32:
+; FUNC-LABEL: {{^}}constant_sextload_v3i16_to_v3i32:
; GCN: s_load_dwordx2
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9].[XYZW]}},
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9].[XYZW]}},
; v3i16 is naturally 8 byte aligned
-; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1
-; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1
-; TODO: These should use DST, but for some there are redundant MOVs
-; EG-DAG: ASHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal
+; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[PTR:T[0-9]\.[XYZW]]], 0, #1
+; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1
+; EG-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
-define void @constant_constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
%ext = sext <3 x i16> %ld to <3 x i32>
@@ -204,20 +212,24 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}constant_constant_zextload_v4i16_to_v4i32:
+; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i32:
; GCN: s_load_dwordx2
; GCN-DAG: s_and_b32
; GCN-DAG: s_lshr_b32
; v4i16 is naturally 8 byte aligned
-; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1
-; TODO: These should use DST, but for some there are redundant MOVs
-; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}
+; EG: VTX_READ_64 [[LD:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1
+; TODO: This should use LD, but for some there are redundant MOVs
+; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].Y, {{.*\.[XYZW]}}, literal
+; EG-DAG: BFE_UINT {{[* ]*}}[[ST]].W, {{.*\.[XYZW]}}, literal
; EG-DAG: 16
-; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
-; EG-DAG: AND_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
; EG-DAG: 16
-define void @constant_constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+; EG-DAG: AND_INT {{[* ]*}}[[ST]].X, {{T[0-9]\.[XYZW]}}, literal
+; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal
+; EG-DAG: 65535
+; EG-DAG: 65535
+define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(2)* %in
%ext = zext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -230,13 +242,14 @@ define void @constant_constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %
; GCN-DAG: s_sext_i32_i16
; v4i16 is naturally 8 byte aligned
-; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1
-; TODO: These should use DST, but for some there are redundant MOVs
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
+; EG: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1
+; TODO: This should use LD, but for some there are redundant MOVs
+; EG-DAG: BFE_INT {{[* ]*}}[[ST]].X, {{.*}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Z, {{.*}}, 0.0, literal
; TODO: We should use ASHR instead of LSHR + BFE
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{.*}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST]].W, {{.*}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
@@ -254,24 +267,27 @@ define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
; GCN-DAG: s_lshr_b32
; v8i16 is naturally 16 byte aligned
-; EG: VTX_READ_128 [[DST:T[0-9]\.XYZW]], {{T[0-9].[XYZW]}}, 0, #1
-; TODO: These should use DST, but for some there are redundant MOVs
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: 16
-; EG-DAG: 16
-; EG-DAG: 16
-; EG-DAG: 16
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
+; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1
+; TODO: These should use LSHR instead of BFE_UINT
+; TODO: This should use DST, but for some there are redundant MOVs
+; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal
+; EG-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].W, {{.*}}, literal
+; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal
+; EG-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].W, {{.*}}, literal
+; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, literal
+; EG-DAG: AND_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, literal
+; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, literal
+; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
+; EG-DAG: 65535
+; EG-DAG: 65535
+; EG-DAG: 65535
+; EG-DAG: 65535
define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(2)* %in
%ext = zext <8 x i16> %load to <8 x i32>
@@ -285,17 +301,19 @@ define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x
; GCN-DAG: s_sext_i32_i16
; v8i16 is naturally 16 byte aligned
-; EG: VTX_READ_128 [[DST:T[0-9]\.XYZW]], {{T[0-9].[XYZW]}}, 0, #1
-; TODO: These should use DST, but for some there are redundant MOVs
-; TODO: We should use ASHR instead of LSHR + BFE
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]+.[XYZW]}},
+; EG: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1
+; TODO: 4 of these should use ASHR instead of LSHR + BFE_INT
+; TODO: This should use DST, but for some there are redundant MOVs
+; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, {{.*}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].W, {{.*}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Y, {{.*}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].W, {{.*}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, 0.0, literal
+; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, 0.0, literal
; EG-DAG: 16
; EG-DAG: 16
; EG-DAG: 16
@@ -444,7 +462,7 @@ define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
-; TODO: Why not 15 ?
+; TODO: These could be expanded earlier using ASHR 15
; EG: 31
define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
%a = load i16, i16 addrspace(2)* %in
@@ -468,7 +486,7 @@ define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
-; TODO: Why not 15 ?
+; TODO: These could be expanded earlier using ASHR 15
; EG: 31
define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(2)* %in
diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll
index f398dd32e06d..7bd131e6516c 100644
--- a/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1,8 +1,8 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
; FIXME: r600 is broken because the bigger testcases spill and spilling is not implemented
@@ -10,7 +10,7 @@
; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}}
; GCN-HSA: flat_load_ushort
-; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
entry:
%ld = load i16, i16 addrspace(1)* %in
@@ -22,7 +22,7 @@ entry:
; GCN-NOHSA: buffer_load_dword v
; GCN-HSA: flat_load_dword v
-; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
entry:
%ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
@@ -34,8 +34,8 @@ entry:
; GCN-NOHSA: buffer_load_dwordx2 v
; GCN-HSA: flat_load_dwordx2 v
-; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
+; EGCM-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
define void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
@@ -47,7 +47,7 @@ entry:
; GCN-NOHSA: buffer_load_dwordx2
; GCN-HSA: flat_load_dwordx2
-; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
+; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
entry:
%ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
@@ -59,7 +59,7 @@ entry:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
+; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) {
entry:
%ld = load <8 x i16>, <8 x i16> addrspace(1)* %in
@@ -74,8 +74,8 @@ entry:
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) {
entry:
%ld = load <16 x i16>, <16 x i16> addrspace(1)* %in
@@ -90,7 +90,7 @@ entry:
; GCN-HSA: flat_load_ushort
; GCN-HSA: flat_store_dword
-; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
%a = load i16, i16 addrspace(1)* %in
%ext = zext i16 %a to i32
@@ -105,9 +105,9 @@ define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
; GCN-HSA: flat_load_sshort
; GCN-HSA: flat_store_dword
-; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1
-; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
-; EG: 16
+; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1
+; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; EGCM: 16
define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
%a = load i16, i16 addrspace(1)* %in
%ext = sext i16 %a to i32
@@ -119,7 +119,7 @@ define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
; GCN-NOHSA: buffer_load_ushort
; GCN-HSA: flat_load_ushort
-; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = zext <1 x i16> %load to <1 x i32>
@@ -131,9 +131,9 @@ define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i
; GCN-NOHSA: buffer_load_sshort
; GCN-HSA: flat_load_sshort
-; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1
-; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
-; EG: 16
+; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1
+; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
+; EGCM: 16
define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = sext <1 x i16> %load to <1 x i32>
@@ -145,10 +145,9 @@ define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i
; GCN-NOHSA: buffer_load_dword
; GCN-HSA: flat_load_dword
-; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
-; TODO: This should use DST, but for some there are redundant MOVs
-; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
-; EG: 16
+; EGCM: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
+; EGCM: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
+; EGCM: 16
define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = zext <2 x i16> %load to <2 x i32>
@@ -161,13 +160,14 @@ define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
; GCN-HSA: flat_load_dword
-; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
-; TODO: These should use DST, but for some there are redundant MOVs
-; TODO: We should also use ASHR instead of LSHR + BFE
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, 0.0, literal
-; EG-DAG: 16
-; EG-DAG: 16
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XY, {{T[0-9]\.[XYZW]}},
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST:T[0-9]]], {{T[0-9]\.[XYZW]}}
+; EGCM: VTX_READ_32 [[DST:T[0-9].[XYZW]]], [[DST]], 0, #1
+; TODO: This should use ASHR instead of LSHR + BFE
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].X, [[DST]], 0.0, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV.[XYZW]}}, 0.0, literal
+; EGCM-DAG: 16
+; EGCM-DAG: 16
define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = sext <2 x i16> %load to <2 x i32>
@@ -175,16 +175,22 @@ define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
ret void
}
-; FUNC-LABEL: {{^}}global_global_zextload_v3i16_to_v3i32:
+; FUNC-LABEL: {{^}}global_zextload_v3i16_to_v3i32:
; GCN-NOHSA: buffer_load_dwordx2
; GCN-HSA: flat_load_dwordx2
-; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1
-; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}},
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}},
+; EGCM-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 0, #1
+; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, 4, #1
; TODO: This should use DST, but for some there are redundant MOVs
-; EG: LSHR {{[* ]*}}{{T[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
-; EG: 16
-define void @global_global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+; EGCM: LSHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
+; EGCM: 16
+; EGCM: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal
+; EGCM: AND_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], literal
+define void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
%ext = zext <3 x i16> %ld to <3 x i32>
@@ -192,19 +198,23 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}global_global_sextload_v3i16_to_v3i32:
+; FUNC-LABEL: {{^}}global_sextload_v3i16_to_v3i32:
; GCN-NOHSA: buffer_load_dwordx2
; GCN-HSA: flat_load_dwordx2
-; EG-DAG: VTX_READ_32 [[DST_HI:T[0-9]\.[XYZW]]], [[DST_HI]], 0, #1
-; EG-DAG: VTX_READ_16 [[DST_LO:T[0-9]\.[XYZW]]], [[DST_LO]], 4, #1
-; TODO: These should use DST, but for some there are redundant MOVs
-; EG-DAG: ASHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, 0.0, literal
-; EG-DAG: 16
-; EG-DAG: 16
-define void @global_global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}}
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].X, {{T[0-9]\.[XYZW]}},
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XY, {{T[0-9]\.[XYZW]}},
+; EGCM-DAG: VTX_READ_32 [[DST_LO:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 0, #1
+; EGCM-DAG: VTX_READ_16 [[DST_HI:T[0-9]\.[XYZW]]], {{T[0-9].[XYZW]}}, 4, #1
+; TODO: This should use DST, but for some there are redundant MOVs
+; EGCM-DAG: ASHR {{[* ]*}}[[ST_LO]].Y, {{T[0-9]\.[XYZW]}}, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal
+; EGCM-DAG: 16
+; EGCM-DAG: 16
+define void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
%ext = sext <3 x i16> %ld to <3 x i32>
@@ -212,19 +222,22 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}global_global_zextload_v4i16_to_v4i32:
+; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i32:
; GCN-NOHSA: buffer_load_dwordx2
; GCN-HSA: flat_load_dwordx2
-; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1
-; TODO: These should use DST, but for some there are redundant MOVs
-; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
-; EG-DAG: 16
-; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
-; EG-DAG: AND_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
-; EG-DAG: 16
-define void @global_global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST:T[0-9]]], {{T[0-9]\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
+; EGCM: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1
+; TODO: This should use DST, but for some there are redundant MOVs
+; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST]].Y, {{.*}}, literal
+; EGCM-DAG: 16
+; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST]].W, {{.*}}, literal
+; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].X, {{.*}}, literal
+; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{.*}}, literal
+; EGCM-DAG: 16
+define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = zext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -236,17 +249,19 @@ define void @global_global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out,
; GCN-HSA: flat_load_dwordx2
-; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1
-; TODO: These should use DST, but for some there are redundant MOVs
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[ST:T[0-9]]], {{T[0-9]\.[XYZW]}}
+; EG: MEM_RAT_CACHELESS STORE_RAW [[ST:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
+; EGCM: VTX_READ_64 [[DST:T[0-9]]].XY, {{T[0-9].[XYZW]}}, 0, #1
; TODO: We should use ASHR instead of LSHR + BFE
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-; EG-DAG: 16
-; EG-DAG: 16
-; EG-DAG: 16
-; EG-DAG: 16
+; TODO: This should use DST, but for some there are redundant MOVs
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].X, {{.*}}, 0.0, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{.*}}, 0.0, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Z, {{.*}}, 0.0, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].W, {{.*}}, 0.0, literal
+; EGCM-DAG: 16
+; EGCM-DAG: 16
+; EGCM-DAG: 16
+; EGCM-DAG: 16
define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = sext <4 x i16> %load to <4 x i32>
@@ -258,16 +273,29 @@ define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-; EG: VTX_READ_128 [[DST:T[0-9]\.XYZW]], {{T[0-9].[XYZW]}}, 0, #1
-; TODO: These should use DST, but for some there are redundant MOVs
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: 16
-; EG-DAG: 16
-; EG-DAG: 16
-; EG-DAG: 16
+; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}}
+; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]], {{T[0-9]\.[XYZW]}}
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
+; EGCM: CF_END
+; EGCM: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1
+; TODO: These should use LSHR instead of BFE_UINT
+; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].Y, {{.*}}, literal
+; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_LO]].W, {{.*}}, literal
+; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].Y, {{.*}}, literal
+; EGCM-DAG: BFE_UINT {{[* ]*}}[[ST_HI]].W, {{.*}}, literal
+; EGCM-DAG: AND_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, literal
+; EGCM-DAG: AND_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, literal
+; EGCM-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, literal
+; EGCM-DAG: AND_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, literal
+; EGCM-DAG: 65535
+; EGCM-DAG: 65535
+; EGCM-DAG: 65535
+; EGCM-DAG: 65535
+; EGCM-DAG: 16
+; EGCM-DAG: 16
+; EGCM-DAG: 16
+; EGCM-DAG: 16
define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = zext <8 x i16> %load to <8 x i32>
@@ -279,24 +307,29 @@ define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-; EG: VTX_READ_128 [[DST:T[0-9]\.XYZW]], {{T[0-9].[XYZW]}}, 0, #1
-; TODO: These should use DST, but for some there are redundant MOVs
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
-; EG-DAG: 16
-; EG-DAG: 16
-; EG-DAG: 16
-; EG-DAG: 16
-; EG-DAG: 16
-; EG-DAG: 16
-; EG-DAG: 16
-; EG-DAG: 16
+; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_LO:T[0-9]]], {{T[0-9]\.[XYZW]}}
+; CM-DAG: MEM_RAT_CACHELESS STORE_DWORD [[ST_HI:T[0-9]]], {{T[0-9]\.[XYZW]}}
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_LO:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
+; EG-DAG: MEM_RAT_CACHELESS STORE_RAW [[ST_HI:T[0-9]]].XYZW, {{T[0-9]\.[XYZW]}},
+; EGCM: CF_END
+; EGCM: VTX_READ_128 [[DST:T[0-9]]].XYZW, {{T[0-9].[XYZW]}}, 0, #1
+; TODO: These should use ASHR instead of LSHR + BFE_INT
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Y, {{.*}}, 0.0, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].W, {{.*}}, 0.0, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Y, {{.*}}, 0.0, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].W, {{.*}}, 0.0, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].X, {{.*}}, 0.0, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_LO]].Z, {{.*}}, 0.0, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{.*}}, 0.0, literal
+; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].Z, {{.*}}, 0.0, literal
+; EGCM-DAG: 16
+; EGCM-DAG: 16
+; EGCM-DAG: 16
+; EGCM-DAG: 16
+; EGCM-DAG: 16
+; EGCM-DAG: 16
+; EGCM-DAG: 16
+; EGCM-DAG: 16
define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = sext <8 x i16> %load to <8 x i32>
@@ -311,8 +344,8 @@ define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = zext <16 x i16> %load to <16 x i32>
@@ -322,8 +355,8 @@ define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16
; FUNC-LABEL: {{^}}global_sextload_v16i16_to_v16i32:
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = sext <16 x i16> %load to <16 x i32>
@@ -342,10 +375,10 @@ define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = zext <32 x i16> %load to <32 x i32>
@@ -364,10 +397,10 @@ define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = sext <32 x i16> %load to <32 x i32>
@@ -394,14 +427,14 @@ define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1
define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
%load = load <64 x i16>, <64 x i16> addrspace(1)* %in
%ext = zext <64 x i16> %load to <64 x i32>
@@ -411,14 +444,14 @@ define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64
; FUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i32:
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1
-; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 64, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1
+; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1
define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
%load = load <64 x i16>, <64 x i16> addrspace(1)* %in
%ext = sext <64 x i16> %load to <64 x i32>
@@ -434,8 +467,8 @@ define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64
; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-; EG: MOV {{.*}}, 0.0
+; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EGCM: MOV {{.*}}, 0.0
define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
%a = load i16, i16 addrspace(1)* %in
%ext = zext i16 %a to i64
@@ -458,10 +491,10 @@ define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
-; TODO: Why not 15 ?
-; EG: 31
+; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
+; TODO: These could be expanded earlier using ASHR 15
+; EGCM: 31
define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
%a = load i16, i16 addrspace(1)* %in
%ext = sext i16 %a to i64
@@ -471,8 +504,8 @@ define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
; FUNC-LABEL: {{^}}global_zextload_v1i16_to_v1i64:
-; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-; EG: MOV {{.*}}, 0.0
+; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EGCM: MOV {{.*}}, 0.0
define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = zext <1 x i16> %load to <1 x i64>
@@ -482,10 +515,10 @@ define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
; FUNC-LABEL: {{^}}global_sextload_v1i16_to_v1i64:
-; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
-; TODO: Why not 15 ?
-; EG: 31
+; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
+; TODO: These could be expanded earlier using ASHR 15
+; EGCM: 31
define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = sext <1 x i16> %load to <1 x i64>
@@ -503,7 +536,7 @@ define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i64:
-; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
+; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = sext <2 x i16> %load to <2 x i64>
@@ -513,7 +546,7 @@ define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i64:
-; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
+; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = zext <4 x i16> %load to <4 x i64>
@@ -523,7 +556,7 @@ define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i64:
-; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
+; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = sext <4 x i16> %load to <4 x i64>
@@ -533,7 +566,7 @@ define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i64:
-; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
+; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = zext <8 x i16> %load to <8 x i64>
@@ -543,7 +576,7 @@ define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i64:
-; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
+; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = sext <8 x i16> %load to <8 x i64>
@@ -553,8 +586,8 @@ define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
; FUNC-LABEL: {{^}}global_zextload_v16i16_to_v16i64:
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = zext <16 x i16> %load to <16 x i64>
@@ -564,8 +597,8 @@ define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16
; FUNC-LABEL: {{^}}global_sextload_v16i16_to_v16i64:
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = sext <16 x i16> %load to <16 x i64>
@@ -575,10 +608,10 @@ define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16
; FUNC-LABEL: {{^}}global_zextload_v32i16_to_v32i64:
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = zext <32 x i16> %load to <32 x i64>
@@ -588,10 +621,10 @@ define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32
; FUNC-LABEL: {{^}}global_sextload_v32i16_to_v32i64:
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
-; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
+; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = sext <32 x i16> %load to <32 x i64>
diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll
index 5d64a152af3c..13d56535303f 100644
--- a/test/CodeGen/AMDGPU/min.ll
+++ b/test/CodeGen/AMDGPU/min.ll
@@ -1,10 +1,9 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-
; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
-; SI: v_min_i32_e32
+; GCN: v_min_i32_e32
; EG: MIN_INT
define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
@@ -17,7 +16,7 @@ define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
}
; FUNC-LABEL: {{^}}s_test_imin_sle_i32:
-; SI: s_min_i32
+; GCN: s_min_i32
; EG: MIN_INT
define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -28,7 +27,7 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
}
; FUNC-LABEL: {{^}}s_test_imin_sle_v1i32:
-; SI: s_min_i32
+; GCN: s_min_i32
; EG: MIN_INT
define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
@@ -39,10 +38,10 @@ define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <
}
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i32:
-; SI: s_min_i32
-; SI: s_min_i32
-; SI: s_min_i32
-; SI: s_min_i32
+; GCN: s_min_i32
+; GCN: s_min_i32
+; GCN: s_min_i32
+; GCN: s_min_i32
; EG: MIN_INT
; EG: MIN_INT
@@ -56,11 +55,11 @@ define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <
}
; FUNC-LABEL: {{^}}s_test_imin_sle_i8:
-; SI: s_load_dword
-; SI: s_load_dword
-; SI: s_sext_i32_i8
-; SI: s_sext_i32_i8
-; SI: s_min_i32
+; GCN: s_load_dword
+; GCN: s_load_dword
+; GCN: s_sext_i32_i8
+; GCN: s_sext_i32_i8
+; GCN: s_min_i32
define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind {
%cmp = icmp sle i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
@@ -72,21 +71,26 @@ define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind {
; extloads with mubuf instructions.
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8:
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
-; SI: buffer_load_sbyte
+; GCN: buffer_load_sbyte
+; GCN: buffer_load_sbyte
+; GCN: buffer_load_sbyte
+; GCN: buffer_load_sbyte
+; GCN: buffer_load_sbyte
+; GCN: buffer_load_sbyte
+; GCN: buffer_load_sbyte
+; GCN: buffer_load_sbyte
; SI: v_min_i32
; SI: v_min_i32
; SI: v_min_i32
; SI: v_min_i32
-; SI: s_endpgm
+; VI: v_min_i32
+; VI: v_min_i32
+; VI: v_min_i32
+; VI: v_min_i32
+
+; GCN: s_endpgm
; EG: MIN_INT
; EG: MIN_INT
@@ -117,7 +121,7 @@ define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <
}
; FUNC-LABEL: @v_test_imin_slt_i32
-; SI: v_min_i32_e32
+; GCN: v_min_i32_e32
; EG: MIN_INT
define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
@@ -130,7 +134,7 @@ define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
}
; FUNC-LABEL: @s_test_imin_slt_i32
-; SI: s_min_i32
+; GCN: s_min_i32
; EG: MIN_INT
define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -141,8 +145,8 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
}
; FUNC-LABEL: {{^}}s_test_imin_slt_v2i32:
-; SI: s_min_i32
-; SI: s_min_i32
+; GCN: s_min_i32
+; GCN: s_min_i32
; EG: MIN_INT
; EG: MIN_INT
@@ -154,7 +158,7 @@ define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <
}
; FUNC-LABEL: {{^}}s_test_imin_slt_imm_i32:
-; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
+; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
@@ -165,7 +169,7 @@ define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
}
; FUNC-LABEL: {{^}}s_test_imin_sle_imm_i32:
-; SI: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
+; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
@@ -176,7 +180,7 @@ define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
}
; FUNC-LABEL: @v_test_umin_ule_i32
-; SI: v_min_u32_e32
+; GCN: v_min_u32_e32
; EG: MIN_UINT
define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
@@ -189,11 +193,11 @@ define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
}
; FUNC-LABEL: @v_test_umin_ule_v3i32
-; SI: v_min_u32_e32
-; SI: v_min_u32_e32
-; SI: v_min_u32_e32
+; GCN: v_min_u32_e32
+; GCN: v_min_u32_e32
+; GCN: v_min_u32_e32
; SI-NOT: v_min_u32_e32
-; SI: s_endpgm
+; GCN: s_endpgm
; EG: MIN_UINT
; EG: MIN_UINT
@@ -207,7 +211,7 @@ define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrs
ret void
}
; FUNC-LABEL: @s_test_umin_ule_i32
-; SI: s_min_u32
+; GCN: s_min_u32
; EG: MIN_UINT
define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -218,7 +222,7 @@ define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
}
; FUNC-LABEL: @v_test_umin_ult_i32
-; SI: v_min_u32_e32
+; GCN: v_min_u32_e32
; EG: MIN_UINT
define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
@@ -231,9 +235,9 @@ define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
}
; FUNC-LABEL: {{^}}v_test_umin_ult_i8:
-; SI: buffer_load_ubyte
-; SI: buffer_load_ubyte
-; SI: v_min_u32_e32
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: v_min_u32_e32
; EG: MIN_UINT
define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
@@ -246,7 +250,7 @@ define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i
}
; FUNC-LABEL: @s_test_umin_ult_i32
-; SI: s_min_u32
+; GCN: s_min_u32
; EG: MIN_UINT
define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
@@ -258,10 +262,10 @@ define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
; FUNC-LABEL: @v_test_umin_ult_i32_multi_use
; SI-NOT: v_min
-; SI: v_cmp_lt_u32
+; GCN: v_cmp_lt_u32
; SI-NEXT: v_cndmask_b32
; SI-NOT: v_min
-; SI: s_endpgm
+; GCN: s_endpgm
; EG-NOT: MIN_UINT
define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
@@ -274,9 +278,27 @@ define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace
ret void
}
+; FUNC-LABEL: @v_test_umin_ult_i16_multi_use
+; GCN-NOT: v_min
+; GCN: v_cmp_lt_u32
+; GCN-NEXT: v_cndmask_b32
+; GCN-NOT: v_min
+; GCN: s_endpgm
+
+; EG-NOT: MIN_UINT
+define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+ %a = load i16, i16 addrspace(1)* %aptr, align 2
+ %b = load i16, i16 addrspace(1)* %bptr, align 2
+ %cmp = icmp ult i16 %a, %b
+ %val = select i1 %cmp, i16 %a, i16 %b
+ store i16 %val, i16 addrspace(1)* %out0, align 2
+ store i1 %cmp, i1 addrspace(1)* %out1
+ ret void
+}
+
; FUNC-LABEL: @s_test_umin_ult_v1i32
-; SI: s_min_u32
+; GCN: s_min_u32
; EG: MIN_UINT
define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
@@ -287,14 +309,14 @@ define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <
}
; FUNC-LABEL: {{^}}s_test_umin_ult_v8i32:
-; SI: s_min_u32
-; SI: s_min_u32
-; SI: s_min_u32
-; SI: s_min_u32
-; SI: s_min_u32
-; SI: s_min_u32
-; SI: s_min_u32
-; SI: s_min_u32
+; GCN: s_min_u32
+; GCN: s_min_u32
+; GCN: s_min_u32
+; GCN: s_min_u32
+; GCN: s_min_u32
+; GCN: s_min_u32
+; GCN: s_min_u32
+; GCN: s_min_u32
; EG: MIN_UINT
; EG: MIN_UINT
@@ -312,14 +334,14 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <
}
; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16:
-; SI: v_min_u32
-; SI: v_min_u32
-; SI: v_min_u32
-; SI: v_min_u32
-; SI: v_min_u32
-; SI: v_min_u32
-; SI: v_min_u32
-; SI: v_min_u32
+; GCN: v_min_u32
+; GCN: v_min_u32
+; GCN: v_min_u32
+; GCN: v_min_u32
+; GCN: v_min_u32
+; GCN: v_min_u32
+; GCN: v_min_u32
+; GCN: v_min_u32
; EG: MIN_UINT
; EG: MIN_UINT
@@ -338,11 +360,11 @@ define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <
; Make sure redundant and removed
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16:
-; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
-; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
-; SI: buffer_store_dword [[VMIN]]
+; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
+; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; GCN: buffer_store_dword [[VMIN]]
; EG: MIN_UINT
define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
@@ -358,11 +380,11 @@ define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i1
; Make sure redundant sign_extend_inreg removed.
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16:
-; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
-; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
-; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
-; SI: buffer_store_dword [[VMIN]]
+; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
+; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
+; GCN: buffer_store_dword [[VMIN]]
; EG: MIN_INT
define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
@@ -377,7 +399,7 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16
}
; FUNC-LABEL: {{^}}s_test_imin_sle_i16:
-; SI: s_min_i32
+; GCN: s_min_i32
; EG: MIN_INT
define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
@@ -389,7 +411,7 @@ define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwin
; 64 bit
; FUNC-LABEL: {{^}}test_umin_ult_i64
-; SI: s_endpgm
+; GCN: s_endpgm
; EG: MIN_UINT
; EG: MIN_UINT
@@ -401,7 +423,7 @@ define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
}
; FUNC-LABEL: {{^}}test_umin_ule_i64
-; SI: s_endpgm
+; GCN: s_endpgm
; EG: MIN_UINT
; EG: MIN_UINT
@@ -413,7 +435,7 @@ define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
}
; FUNC-LABEL: {{^}}test_imin_slt_i64
-; SI: s_endpgm
+; GCN: s_endpgm
; EG-DAG: MIN_UINT
; EG-DAG: MIN_INT
@@ -425,7 +447,7 @@ define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
}
; FUNC-LABEL: {{^}}test_imin_sle_i64
-; SI: s_endpgm
+; GCN: s_endpgm
; EG-DAG: MIN_UINT
; EG-DAG: MIN_INT
diff --git a/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll b/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll
new file mode 100644
index 000000000000..866a4a9191e2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=r600 -mcpu=cypress -start-after safe-stack %s -o - | FileCheck %s
+; Don't crash
+
+; CHECK: MAX_UINT
+define void @test(i64 addrspace(1)* %out) {
+bb:
+ store i64 2, i64 addrspace(1)* %out
+ %tmp = load i64, i64 addrspace(1)* %out
+ br label %jump
+
+jump: ; preds = %bb
+ %tmp1 = icmp ugt i64 %tmp, 4
+ %umax = select i1 %tmp1, i64 %tmp, i64 4
+ store i64 %umax, i64 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/AMDGPU/store-private.ll b/test/CodeGen/AMDGPU/store-private.ll
new file mode 100644
index 000000000000..33d27f24e9cf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/store-private.ll
@@ -0,0 +1,743 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
+
+; FUNC-LABEL: {{^}}store_i1:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; SI: buffer_store_byte
+define void @store_i1(i1 addrspace(0)* %out) {
+entry:
+ store i1 true, i1 addrspace(0)* %out
+ ret void
+}
+
+; i8 store
+; FUNC-LABEL: {{^}}store_i8:
+; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x
+; EG-NEXT: 2
+; EG: MOVA_INT * AR.x (MASKED)
+; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x
+
+; IG 0: Get the byte index and truncate the value
+; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG-NEXT: 3(4.203895e-45)
+; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.x
+; EG-NEXT: 255(3.573311e-43)
+
+; EG: NOT_INT
+; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]]
+; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]]
+; TODO: Is the reload necessary?
+; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]]
+; EG: MOV * T(0 + AR.x).X+, [[RES]]
+
+; SI: buffer_store_byte
+
+define void @store_i8(i8 addrspace(0)* %out, i8 %in) {
+entry:
+ store i8 %in, i8 addrspace(0)* %out
+ ret void
+}
+
+; i16 store
+; FUNC-LABEL: {{^}}store_i16:
+; EG: LSHR * [[ADDRESS:T[0-9]\.[XYZW]]], KC0[2].Y, literal.x
+; EG-NEXT: 2
+; EG: MOVA_INT * AR.x (MASKED)
+; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x
+
+; IG 0: Get the byte index and truncate the value
+; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG-NEXT: 3(4.203895e-45)
+; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.x
+; EG-NEXT: 65535(9.183409e-41)
+
+; EG: NOT_INT
+; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]]
+; EG: OR_INT * [[RES:T[0-9]\.[XYZW]]]
+; TODO: Is the reload necessary?
+; EG: MOVA_INT * AR.x (MASKED), [[ADDRESS]]
+; EG: MOV * T(0 + AR.x).X+, [[RES]]
+
+; SI: buffer_store_short
+define void @store_i16(i16 addrspace(0)* %out, i16 %in) {
+entry:
+ store i16 %in, i16 addrspace(0)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_i24:
+; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; SI-DAG: buffer_store_byte
+; SI-DAG: buffer_store_short
+
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store can be eliminated
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store can be eliminated
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+define void @store_i24(i24 addrspace(0)* %out, i24 %in) {
+entry:
+ store i24 %in, i24 addrspace(0)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_i25:
+; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x1ffffff{{$}}
+; SI: v_mov_b32_e32 [[VAND:v[0-9]+]], [[AND]]
+; SI: buffer_store_dword [[VAND]]
+
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG-NOT: MOVA_INT
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM-NOT: MOVA_INT
+define void @store_i25(i25 addrspace(0)* %out, i25 %in) {
+entry:
+ store i25 %in, i25 addrspace(0)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v2i8:
+; v2i8 is naturally 2B aligned, treat as i16
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG-NOT: MOVA_INT
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM-NOT: MOVA_INT
+
+; SI: buffer_store_short
+define void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
+entry:
+ %0 = trunc <2 x i32> %in to <2 x i8>
+ store <2 x i8> %0, <2 x i8> addrspace(0)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v2i8_unaligned:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; SI: buffer_store_byte
+define void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
+entry:
+ %0 = trunc <2 x i32> %in to <2 x i8>
+ store <2 x i8> %0, <2 x i8> addrspace(0)* %out, align 1
+ ret void
+}
+
+
+; FUNC-LABEL: {{^}}store_v2i16:
+; v2i8 is naturally 2B aligned, treat as i16
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG-NOT: MOVA_INT
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM-NOT: MOVA_INT
+
+; SI: buffer_store_dword
+define void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
+entry:
+ %0 = trunc <2 x i32> %in to <2 x i16>
+ store <2 x i16> %0, <2 x i16> addrspace(0)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v2i16_unaligned:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; SI: buffer_store_short
+; SI: buffer_store_short
+define void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
+entry:
+ %0 = trunc <2 x i32> %in to <2 x i16>
+ store <2 x i16> %0, <2 x i16> addrspace(0)* %out, align 2
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v4i8:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG-NOT: MOVA_INT
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM-NOT: MOVA_INT
+
+; SI: buffer_store_dword
+define void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
+entry:
+ %0 = trunc <4 x i32> %in to <4 x i8>
+ store <4 x i8> %0, <4 x i8> addrspace(0)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v4i8_unaligned:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI-NOT: buffer_store_dword
+define void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
+entry:
+ %0 = trunc <4 x i32> %in to <4 x i8>
+ store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v8i8_unaligned:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI: buffer_store_byte
+; SI-NOT: buffer_store_dword
+define void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) {
+entry:
+ %0 = trunc <8 x i32> %in to <8 x i8>
+ store <8 x i8> %0, <8 x i8> addrspace(0)* %out, align 1
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v4i8_halfaligned:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; TODO: This load and store cannot be eliminated,
+; they might be different locations
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; SI: buffer_store_short
+; SI: buffer_store_short
+; SI-NOT: buffer_store_dword
+define void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
+entry:
+ %0 = trunc <4 x i32> %in to <4 x i8>
+ store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 2
+ ret void
+}
+
+; floating-point store
+; FUNC-LABEL: {{^}}store_f32:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; SI: buffer_store_dword
+
+define void @store_f32(float addrspace(0)* %out, float %in) {
+ store float %in, float addrspace(0)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v4i16:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+;TODO: why not x2?
+; XSI: buffer_store_dwordx2
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+define void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) {
+entry:
+ %0 = trunc <4 x i32> %in to <4 x i16>
+ store <4 x i16> %0, <4 x i16> addrspace(0)* %out
+ ret void
+}
+
+; vec2 floating-point stores
+; FUNC-LABEL: {{^}}store_v2f32:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+;TODO: why not x2?
+; XSI: buffer_store_dwordx2
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+
+define void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) {
+entry:
+ %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
+ %1 = insertelement <2 x float> %0, float %b, i32 1
+ store <2 x float> %1, <2 x float> addrspace(0)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v3i32:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+;TODO: why not x2?
+; XSI-DAG: buffer_store_dwordx2
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+
+define void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind {
+ store <3 x i32> %a, <3 x i32> addrspace(0)* %out, align 16
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v4i32:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+;TODO: why not x4?
+; XSI: buffer_store_dwordx4
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+define void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
+entry:
+ store <4 x i32> %in, <4 x i32> addrspace(0)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_v4i32_unaligned:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+;TODO: why not x4?
+; XSI: buffer_store_dwordx4
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+define void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
+entry:
+ store <4 x i32> %in, <4 x i32> addrspace(0)* %out, align 4
+ ret void
+}
+
+; v4f32 store
+; FUNC-LABEL: {{^}}store_v4f32:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+;TODO: why not x4?
+; XSI: buffer_store_dwordx4
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+define void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) {
+ %1 = load <4 x float>, <4 x float> addrspace(0) * %in
+ store <4 x float> %1, <4 x float> addrspace(0)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_i64_i8:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; SI: buffer_store_byte
+define void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) {
+entry:
+ %0 = trunc i64 %in to i8
+ store i8 %0, i8 addrspace(0)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}store_i64_i16:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; SI: buffer_store_short
+define void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) {
+entry:
+ %0 = trunc i64 %in to i16
+ store i16 %0, i16 addrspace(0)* %out
+ ret void
+}
+
+; The stores in this function are combined by the optimizer to create a
+; 64-bit store with 32-bit alignment. This is legal and the legalizer
+; should not try to split the 64-bit store back into 2 32-bit stores.
+
+; FUNC-LABEL: {{^}}vecload2:
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+;TODO: why not x2?
+; XSI: buffer_store_dwordx2
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+define void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
+entry:
+ %0 = load i32, i32 addrspace(2)* %mem, align 4
+ %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
+ %1 = load i32, i32 addrspace(2)* %arrayidx1.i, align 4
+ store i32 %0, i32 addrspace(0)* %out, align 4
+ %arrayidx1 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1
+ store i32 %1, i32 addrspace(0)* %arrayidx1, align 4
+ ret void
+}
+
+; When i128 was a legal type this program generated cannot select errors:
+
+; FUNC-LABEL: {{^}}"i128-const-store":
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; EG: MOVA_INT
+; EG: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+; CM: MOVA_INT
+; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
+
+;TODO: why not x4?
+; XSI: buffer_store_dwordx4
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+; SI: buffer_store_dword
+define void @i128-const-store(i32 addrspace(0)* %out) {
+entry:
+ store i32 1, i32 addrspace(0)* %out, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1
+ store i32 1, i32 addrspace(0)* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 2
+ store i32 2, i32 addrspace(0)* %arrayidx4, align 4
+ %arrayidx6 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 3
+ store i32 2, i32 addrspace(0)* %arrayidx6, align 4
+ ret void
+}
+
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AVR/intrinsics/read_register.ll b/test/CodeGen/AVR/intrinsics/read_register.ll
new file mode 100644
index 000000000000..3f28d1d3a9fe
--- /dev/null
+++ b/test/CodeGen/AVR/intrinsics/read_register.ll
@@ -0,0 +1,17 @@
+; RUN: llc -O0 < %s -march=avr | FileCheck %s
+
+; CHECK-LABEL: foo
+define void @foo() {
+entry:
+ %val1 = call i16 @llvm.read_register.i16(metadata !0)
+ %val2 = call i16 @llvm.read_register.i16(metadata !1)
+ %val3 = call i8 @llvm.read_register.i8(metadata !2)
+ ret void
+}
+
+declare i8 @llvm.read_register.i8(metadata)
+declare i16 @llvm.read_register.i16(metadata)
+
+!0 = !{!"r28"}
+!1 = !{!"Z"}
+!2 = !{!"r0"}
diff --git a/test/CodeGen/WebAssembly/function-bitcasts.ll b/test/CodeGen/WebAssembly/function-bitcasts.ll
new file mode 100644
index 000000000000..49980da6eb8f
--- /dev/null
+++ b/test/CodeGen/WebAssembly/function-bitcasts.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
+
+; Test that function pointer casts are replaced with wrappers.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-LABEL: test:
+; CHECK-NEXT: call .Lbitcast@FUNCTION{{$}}
+; CHECK-NEXT: call .Lbitcast.1@FUNCTION{{$}}
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0
+; CHECK-NEXT: call .Lbitcast.2@FUNCTION, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.call $drop=, .Lbitcast.3@FUNCTION{{$}}
+; CHECK-NEXT: call foo2@FUNCTION{{$}}
+; CHECK-NEXT: call foo3@FUNCTION{{$}}
+; CHECK-NEXT: .endfunc
+
+; CHECK-LABEL: .Lbitcast:
+; CHECK-NEXT: .local i32
+; CHECK-NEXT: call has_i32_arg@FUNCTION, $0{{$}}
+; CHECK-NEXT: .endfunc
+
+; CHECK-LABEL: .Lbitcast.1:
+; CHECK-NEXT: call $drop=, has_i32_ret@FUNCTION{{$}}
+; CHECK-NEXT: .endfunc
+
+; CHECK-LABEL: .Lbitcast.2:
+; CHECK-NEXT: .param i32
+; CHECK-NEXT: call foo0@FUNCTION{{$}}
+; CHECK-NEXT: .endfunc
+
+; CHECK-LABEL: .Lbitcast.3:
+; CHECK-NEXT: .result i32
+; CHECK-NEXT: .local i32
+; CHECK-NEXT: call foo1@FUNCTION{{$}}
+; CHECK-NEXT: copy_local $push0=, $0
+; CHECK-NEXT: .endfunc
+
+declare void @has_i32_arg(i32)
+declare i32 @has_i32_ret()
+
+declare void @foo0()
+declare void @foo1()
+declare void @foo2()
+declare void @foo3()
+
+define void @test() {
+entry:
+ call void bitcast (void (i32)* @has_i32_arg to void ()*)()
+ call void bitcast (i32 ()* @has_i32_ret to void ()*)()
+ call void bitcast (void ()* @foo0 to void (i32)*)(i32 0)
+ %t = call i32 bitcast (void ()* @foo1 to i32 ()*)()
+ call void bitcast (void ()* @foo2 to void ()*)()
+ call void @foo3()
+ ret void
+}
diff --git a/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll b/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
new file mode 100644
index 000000000000..ef4318ec299b
--- /dev/null
+++ b/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -asm-verbose=false | FileCheck %s
+
+; Test that function pointer casts that require conversions are not converted
+; to wrappers. In theory some conversions could be supported, but currently no
+; conversions are implemented.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+; CHECK-LABEL: test:
+; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: call has_i64_arg@FUNCTION, $pop[[L0]]{{$}}
+; CHECK-NEXT: i32.call $drop=, has_i64_ret@FUNCTION{{$}}
+; CHECK-NEXT: .endfunc
+
+; CHECK-NOT: .Lbitcast
+
+declare void @has_i64_arg(i64)
+declare i64 @has_i64_ret()
+
+define void @test() {
+entry:
+ call void bitcast (void (i64)* @has_i64_arg to void (i32)*)(i32 0)
+ %t = call i32 bitcast (i64 ()* @has_i64_ret to i32 ()*)()
+ ret void
+}
diff --git a/test/CodeGen/X86/avx2-arith.ll b/test/CodeGen/X86/avx2-arith.ll
index e1341624cad3..aec74424b9b2 100644
--- a/test/CodeGen/X86/avx2-arith.ll
+++ b/test/CodeGen/X86/avx2-arith.ll
@@ -142,17 +142,108 @@ define <16 x i16> @test_vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone
ret <16 x i16> %x
}
-define <16 x i8> @mul-v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
+define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
+; X32-LABEL: mul_v16i8:
+; X32: ## BB#0:
+; X32-NEXT: vpmovsxbw %xmm1, %ymm1
+; X32-NEXT: vpmovsxbw %xmm0, %ymm0
+; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X32-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; X32-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; X32-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: mul_v16i8:
+; X64: ## BB#0:
+; X64-NEXT: vpmovsxbw %xmm1, %ymm1
+; X64-NEXT: vpmovsxbw %xmm0, %ymm0
+; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; X64-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; X64-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
%x = mul <16 x i8> %i, %j
ret <16 x i8> %x
}
-define <32 x i8> @mul-v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
+; X32-LABEL: mul_v32i8:
+; X32: ## BB#0:
+; X32-NEXT: vextracti128 $1, %ymm1, %xmm2
+; X32-NEXT: vpmovsxbw %xmm2, %ymm2
+; X32-NEXT: vextracti128 $1, %ymm0, %xmm3
+; X32-NEXT: vpmovsxbw %xmm3, %ymm3
+; X32-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; X32-NEXT: vextracti128 $1, %ymm2, %xmm3
+; X32-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; X32-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; X32-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; X32-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X32-NEXT: vpmovsxbw %xmm1, %ymm1
+; X32-NEXT: vpmovsxbw %xmm0, %ymm0
+; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X32-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; X32-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: mul_v32i8:
+; X64: ## BB#0:
+; X64-NEXT: vextracti128 $1, %ymm1, %xmm2
+; X64-NEXT: vpmovsxbw %xmm2, %ymm2
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm3
+; X64-NEXT: vpmovsxbw %xmm3, %ymm3
+; X64-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; X64-NEXT: vextracti128 $1, %ymm2, %xmm3
+; X64-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; X64-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; X64-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-NEXT: vpmovsxbw %xmm1, %ymm1
+; X64-NEXT: vpmovsxbw %xmm0, %ymm0
+; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; X64-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X64-NEXT: retq
%x = mul <32 x i8> %i, %j
ret <32 x i8> %x
}
-define <4 x i64> @mul-v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
+; X32-LABEL: mul_v4i64:
+; X32: ## BB#0:
+; X32-NEXT: vpsrlq $32, %ymm0, %ymm2
+; X32-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
+; X32-NEXT: vpsrlq $32, %ymm1, %ymm3
+; X32-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; X32-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; X32-NEXT: vpsllq $32, %ymm2, %ymm2
+; X32-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; X32-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: mul_v4i64:
+; X64: ## BB#0:
+; X64-NEXT: vpsrlq $32, %ymm0, %ymm2
+; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
+; X64-NEXT: vpsrlq $32, %ymm1, %ymm3
+; X64-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2
+; X64-NEXT: vpsllq $32, %ymm2, %ymm2
+; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; X64-NEXT: retq
%x = mul <4 x i64> %i, %j
ret <4 x i64> %x
}
@@ -291,8 +382,8 @@ define <8 x i32> @mul_const9(<8 x i32> %x) {
ret <8 x i32> %y
}
+; %x * 0x01010101
define <4 x i32> @mul_const10(<4 x i32> %x) {
- ; %x * 0x01010101
; X32-LABEL: mul_const10:
; X32: ## BB#0:
; X32-NEXT: vpbroadcastd LCPI22_0, %xmm1
@@ -308,8 +399,8 @@ define <4 x i32> @mul_const10(<4 x i32> %x) {
ret <4 x i32> %m
}
+; %x * 0x80808080
define <4 x i32> @mul_const11(<4 x i32> %x) {
- ; %x * 0x80808080
; X32-LABEL: mul_const11:
; X32: ## BB#0:
; X32-NEXT: vpbroadcastd LCPI23_0, %xmm1
diff --git a/test/CodeGen/X86/avx512-bugfix-23634.ll b/test/CodeGen/X86/avx512-bugfix-23634.ll
index 0dcfb7c169f3..e66eefdb8e9f 100644
--- a/test/CodeGen/X86/avx512-bugfix-23634.ll
+++ b/test/CodeGen/X86/avx512-bugfix-23634.ll
@@ -15,7 +15,7 @@ define void @f_fu(float* %ret, float* %aa, float %b) {
; CHECK-NEXT: vpsrad $1, %zmm2, %zmm2
; CHECK-NEXT: movw $-21846, %ax ## imm = 0xAAAA
; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: vpblendmd {{.*}}(%rip), %zmm1, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa32 {{.*}}(%rip), %zmm1 {%k1}
; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0
diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll
index 532678ae72fa..1a91bc1dee9a 100644
--- a/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/test/CodeGen/X86/avx512-calling-conv.ll
@@ -25,8 +25,7 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -48,8 +47,7 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0
; KNL_X32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_X32-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
-; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0
; KNL_X32-NEXT: retl
%c = and <16 x i1>%a, %b
@@ -65,8 +63,7 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -88,8 +85,7 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
; KNL_X32-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
-; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
; KNL_X32-NEXT: retl
%c = and <8 x i1>%a, %b
@@ -180,8 +176,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; KNL-NEXT: Lcfi1:
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: callq _func16xi1
; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -210,8 +205,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; KNL_X32-NEXT: Lcfi1:
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
-; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0
; KNL_X32-NEXT: calll _func16xi1
; KNL_X32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -285,8 +279,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; KNL-NEXT: movb $85, %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: popq %rax
; KNL-NEXT: retq
@@ -322,8 +315,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; KNL_X32-NEXT: movb $85, %al
; KNL_X32-NEXT: kmovw %eax, %k1
; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
-; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
; KNL_X32-NEXT: addl $12, %esp
; KNL_X32-NEXT: retl
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index c2eb19d16650..5e50a3aef2f2 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -740,8 +740,7 @@ define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
; KNL: ## BB#0:
; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vcvtdq2ps %zmm0, %zmm0
; KNL-NEXT: retq
;
@@ -805,11 +804,10 @@ define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
; KNL-NEXT: vcmpltpd %zmm1, %zmm2, %k1
; KNL-NEXT: vcmpltpd %zmm0, %zmm2, %k2
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
-; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} {z}
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0
-; KNL-NEXT: vmovdqa64 %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovqd %zmm1, %ymm1
; KNL-NEXT: vcvtdq2pd %ymm1, %zmm1
; KNL-NEXT: retq
@@ -834,8 +832,7 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
; KNL: ## BB#0:
; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0
; KNL-NEXT: retq
@@ -858,8 +855,7 @@ define <8 x float> @sitofp_8i1_float(<8 x float> %a) {
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1
; KNL-NEXT: vcmpltps %zmm0, %zmm1, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vcvtdq2ps %ymm0, %ymm0
; KNL-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll
index 32bd0804d637..03d6127ae5dc 100644
--- a/test/CodeGen/X86/avx512-ext.ll
+++ b/test/CodeGen/X86/avx512-ext.ll
@@ -345,9 +345,9 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
@@ -369,9 +369,9 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovsxbd (%rdi), %ymm0
-; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovsxbd (%rdi), %ymm1
+; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
@@ -704,9 +704,9 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
@@ -728,9 +728,9 @@ define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounw
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vpmovsxwd (%rdi), %ymm0
-; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovsxwd (%rdi), %ymm1
+; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
@@ -762,9 +762,9 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
@@ -1457,8 +1457,7 @@ define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
; KNL-LABEL: sext_16i1_16i32:
; KNL: ## BB#0:
; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: sext_16i1_16i32:
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 26d14fa0840f..cb8ed0e59a3a 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -365,11 +365,10 @@ define i16 @test16(i1 *%addr, i16 %a) {
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kmovw %esi, %k2
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z}
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
-; KNL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; KNL-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -402,11 +401,10 @@ define i8 @test17(i1 *%addr, i8 %a) {
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: kmovw %esi, %k2
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z}
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
-; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -1242,30 +1240,29 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
; KNL-NEXT: vpextrd $1, %xmm0, %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k2
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
-; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} {z}
+; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vmovd %xmm0, %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k2
-; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} {z}
-; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,2,3,4,5,6,7]
-; KNL-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
-; KNL-NEXT: vpsllq $63, %zmm4, %zmm2
-; KNL-NEXT: vptestmq %zmm2, %zmm2, %k2
-; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} {z}
-; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} {z}
-; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,5,6,7]
-; KNL-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
-; KNL-NEXT: vpsllq $63, %zmm4, %zmm2
-; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1
-; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; KNL-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; KNL-NEXT: vpsllq $63, %zmm3, %zmm1
+; KNL-NEXT: vptestmq %zmm1, %zmm1, %k2
+; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; KNL-NEXT: vpsllq $63, %zmm3, %zmm1
+; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpextrd $3, %xmm0, %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; KNL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,8,4,5,6,7]
-; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm1
-; KNL-NEXT: vpsllq $63, %zmm1, %zmm0
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: retq
@@ -1306,11 +1303,10 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y)
; KNL-NEXT: vmovq %xmm0, %rax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: kmovw %eax, %k2
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z}
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index d48f63536e0e..b127585dc87b 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -344,8 +344,7 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; KNL-NEXT: LBB17_1:
; KNL-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
; KNL-NEXT: LBB17_3:
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -382,8 +381,7 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
; KNL-NEXT: LBB18_3:
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -472,8 +470,7 @@ define <16 x i1> @test15(i32 %x, i32 %y) {
; KNL-NEXT: movw $1, %cx
; KNL-NEXT: cmovgw %ax, %cx
; KNL-NEXT: kmovw %ecx, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -510,28 +507,27 @@ define <64 x i8> @test16(i64 %x) {
; KNL-NEXT: movl %edi, (%rsp)
; KNL-NEXT: shrq $32, %rdi
; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; KNL-NEXT: kmovw (%rsp), %k1
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} {z}
-; KNL-NEXT: vpmovdb %zmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm2
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; KNL-NEXT: movl $1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
+; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1
-; KNL-NEXT: vpsllw $7, %ymm2, %ymm0
-; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
@@ -574,30 +570,29 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
; KNL-NEXT: movl %edi, (%rsp)
; KNL-NEXT: shrq $32, %rdi
; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; KNL-NEXT: kmovw (%rsp), %k1
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} {z}
-; KNL-NEXT: vpmovdb %zmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; KNL-NEXT: xorl %eax, %eax
; KNL-NEXT: cmpl %edx, %esi
; KNL-NEXT: setg %al
; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z}
-; KNL-NEXT: vpmovdb %zmm2, %xmm2
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
@@ -635,18 +630,17 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kshiftlw $6, %k2, %k2
; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z}
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} {z}
-; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7]
-; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; KNL-NEXT: vpsllq $63, %zmm3, %zmm1
-; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
+; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL-NEXT: kshiftlw $1, %k1, %k1
; KNL-NEXT: kshiftrw $1, %k1, %k1
; KNL-NEXT: kshiftlw $7, %k0, %k0
; KNL-NEXT: korw %k0, %k1, %k1
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -1387,8 +1381,7 @@ define <8 x i64> @load_8i1(<8 x i1>* %a) {
; KNL: ## BB#0:
; KNL-NEXT: movzbl (%rdi), %eax
; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: load_8i1:
@@ -1405,8 +1398,7 @@ define <16 x i32> @load_16i1(<16 x i1>* %a) {
; KNL-LABEL: load_16i1:
; KNL: ## BB#0:
; KNL-NEXT: kmovw (%rdi), %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: load_16i1:
@@ -1424,8 +1416,7 @@ define <2 x i16> @load_2i1(<2 x i1>* %a) {
; KNL: ## BB#0:
; KNL-NEXT: movzbl (%rdi), %eax
; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
;
@@ -1444,8 +1435,7 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) {
; KNL: ## BB#0:
; KNL-NEXT: movzbl (%rdi), %eax
; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; KNL-NEXT: retq
@@ -1465,10 +1455,9 @@ define <32 x i16> @load_32i1(<32 x i1>* %a) {
; KNL: ## BB#0:
; KNL-NEXT: kmovw (%rdi), %k1
; KNL-NEXT: kmovw 2(%rdi), %k2
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdw %zmm1, %ymm1
; KNL-NEXT: retq
;
@@ -1489,17 +1478,16 @@ define <64 x i8> @load_64i1(<64 x i1>* %a) {
; KNL-NEXT: kmovw 2(%rdi), %k2
; KNL-NEXT: kmovw 4(%rdi), %k3
; KNL-NEXT: kmovw 6(%rdi), %k4
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} {z}
-; KNL-NEXT: vpmovdb %zmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} {z}
-; KNL-NEXT: vpmovdb %zmm2, %xmm2
-; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k4} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k4} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: load_64i1:
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
index 2a0de05608b4..9234ae838cff 100644
--- a/test/CodeGen/X86/avx512-mov.ll
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -313,7 +313,7 @@ define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04]
-; CHECK-NEXT: vpblendmd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x64,0x07]
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i32>*
@@ -327,7 +327,7 @@ define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04]
-; CHECK-NEXT: vpblendmd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x64,0x07]
+; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i32>*
@@ -369,7 +369,7 @@ define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04]
-; CHECK-NEXT: vpblendmq (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x64,0x07]
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i64>*
@@ -383,7 +383,7 @@ define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04]
-; CHECK-NEXT: vpblendmq (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x64,0x07]
+; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i64>*
@@ -426,7 +426,7 @@ define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1)
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x07]
; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04]
-; CHECK-NEXT: vblendmps (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x65,0x07]
+; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x float>*
@@ -441,7 +441,7 @@ define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1)
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x07]
; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04]
-; CHECK-NEXT: vblendmps (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x65,0x07]
+; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x float>*
@@ -486,7 +486,7 @@ define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1)
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07]
; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04]
-; CHECK-NEXT: vblendmpd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x65,0x07]
+; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x double>*
@@ -501,7 +501,7 @@ define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1)
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07]
; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04]
-; CHECK-NEXT: vblendmpd (%rdi), %zmm0, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x65,0x07]
+; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x double>*
diff --git a/test/CodeGen/X86/avx512-regcall-NoMask.ll b/test/CodeGen/X86/avx512-regcall-NoMask.ll
index ce8fca036c91..a29c1e4628a1 100644
--- a/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -325,11 +325,13 @@ define x86_regcallcc [4 x i32]* @test_CallargRetPointer([4 x i32]* %a) {
}
; X32-LABEL: test_argRet128Vector:
-; X32: vpblend{{.*}} %xmm0, %xmm1, %xmm0
+; X32: vmovdqa{{.*}} %xmm0, %xmm1
+; X32: vmovdqa{{.*}} %xmm1, %xmm0
; X32: ret{{.*}}
; WIN64-LABEL: test_argRet128Vector:
-; WIN64: vpblend{{.*}} %xmm0, %xmm1, %xmm0
+; WIN64: vmovdqa{{.*}} %xmm0, %xmm1
+; WIN64: vmovdqa{{.*}} %xmm1, %xmm0
; WIN64: ret{{.*}}
; Test regcall when receiving/returning 128 bit vector
@@ -341,13 +343,13 @@ define x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i32> %a, <4 x i32> %b)
; X32-LABEL: test_CallargRet128Vector:
; X32: vmov{{.*}} %xmm0, {{%xmm([0-7])}}
; X32: call{{.*}} {{.*}}test_argRet128Vector
-; X32: vpblend{{.*}} {{%xmm([0-7])}}, %xmm0, %xmm0
+; X32: vmovdqa{{.*}} {{%xmm([0-7])}}, %xmm0
; X32: ret{{.*}}
; WIN64-LABEL: test_CallargRet128Vector:
; WIN64: vmov{{.*}} %xmm0, {{%xmm([0-9]+)}}
; WIN64: call{{.*}} {{.*}}test_argRet128Vector
-; WIN64: vpblend{{.*}} {{%xmm([0-9]+)}}, %xmm0, %xmm0
+; WIN64: vmovdqa{{.*}} {{%xmm([0-9]+)}}, %xmm0
; WIN64: ret{{.*}}
; Test regcall when passing/retrieving 128 bit vector
@@ -358,11 +360,13 @@ define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a) {
}
; X32-LABEL: test_argRet256Vector:
-; X32: vpblend{{.*}} %ymm0, %ymm1, %ymm0
+; X32: vmovdqa{{.*}} %ymm0, %ymm1
+; X32: vmovdqa{{.*}} %ymm1, %ymm0
; X32: ret{{.*}}
; WIN64-LABEL: test_argRet256Vector:
-; WIN64: vpblend{{.*}} %ymm0, %ymm1, %ymm0
+; WIN64: vmovdqa{{.*}} %ymm0, %ymm1
+; WIN64: vmovdqa{{.*}} %ymm1, %ymm0
; WIN64: ret{{.*}}
; Test regcall when receiving/returning 256 bit vector
@@ -374,13 +378,13 @@ define x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i32> %a, <8 x i32> %b)
; X32-LABEL: test_CallargRet256Vector:
; X32: vmov{{.*}} %ymm0, %ymm1
; X32: call{{.*}} {{.*}}test_argRet256Vector
-; X32: vpblend{{.*}} %ymm1, %ymm0, %ymm0
+; X32: vmovdqa{{.*}} %ymm1, %ymm0
; X32: ret{{.*}}
; WIN64-LABEL: test_CallargRet256Vector:
; WIN64: vmov{{.*}} %ymm0, %ymm1
; WIN64: call{{.*}} {{.*}}test_argRet256Vector
-; WIN64: vpblend{{.*}} %ymm1, %ymm0, %ymm0
+; WIN64: vmovdqa{{.*}} %ymm1, %ymm0
; WIN64: ret{{.*}}
; Test regcall when passing/retrieving 256 bit vector
@@ -391,11 +395,13 @@ define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a) {
}
; X32-LABEL: test_argRet512Vector:
-; X32: vpblend{{.*}} %zmm0, %zmm1, %zmm0
+; X32: vmovdqa{{.*}} %zmm0, %zmm1
+; X32: vmovdqa{{.*}} %zmm1, %zmm0
; X32: ret{{.*}}
; WIN64-LABEL: test_argRet512Vector:
-; WIN64: vpblend{{.*}} %zmm0, %zmm1, %zmm0
+; WIN64: vmovdqa{{.*}} %zmm0, %zmm1
+; WIN64: vmovdqa{{.*}} %zmm1, %zmm0
; WIN64: ret{{.*}}
; Test regcall when receiving/returning 512 bit vector
@@ -407,13 +413,13 @@ define x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i32> %a, <16 x i32>
; X32-LABEL: test_CallargRet512Vector:
; X32: vmov{{.*}} %zmm0, %zmm1
; X32: call{{.*}} {{.*}}test_argRet512Vector
-; X32: vpblend{{.*}} %zmm1, %zmm0, %zmm0
+; X32: vmovdqa{{.*}} %zmm1, %zmm0
; X32: ret{{.*}}
; WIN64-LABEL: test_CallargRet512Vector:
; WIN64: vmov{{.*}} %zmm0, %zmm1
; WIN64: call{{.*}} {{.*}}test_argRet512Vector
-; WIN64: vpblend{{.*}} %zmm1, %zmm0, %zmm0
+; WIN64: vmovdqa{{.*}} %zmm1, %zmm0
; WIN64: ret{{.*}}
; Test regcall when passing/retrieving 512 bit vector
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 840239b9011a..1991ee4f3376 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -218,8 +218,7 @@ define <16 x i32> @test_vbroadcast() {
; ALL: # BB#0: # %entry
; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0
; ALL-NEXT: vcmpunordps %zmm0, %zmm0, %k1
-; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; ALL-NEXT: knotw %k1, %k1
; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; ALL-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index bd269ea87a35..361ee1ddbf9d 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -6,7 +6,8 @@ define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
; CHECK-LABEL: test1:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1
-; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%mask = fcmp ole <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
@@ -17,7 +18,8 @@ define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
; CHECK-LABEL: test2:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1
-; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
; CHECK-NEXT: retq
%mask = fcmp ole <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
@@ -28,7 +30,8 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin
; CHECK-LABEL: test3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
-; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %yp, align 4
%mask = icmp eq <16 x i32> %x, %y
@@ -40,7 +43,8 @@ define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1)
; CHECK-LABEL: test4_unsigned:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
-; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask = icmp uge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
@@ -51,7 +55,8 @@ define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
; CHECK-LABEL: test5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
-; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask = icmp eq <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
@@ -62,7 +67,8 @@ define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) noun
; CHECK-LABEL: test6_unsigned:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
-; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask = icmp ugt <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
@@ -81,7 +87,8 @@ define <4 x float> @test7(<4 x float> %a, <4 x float> %b) {
; SKX: ## BB#0:
; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2
; SKX-NEXT: vcmpltps %xmm2, %xmm0, %k1
-; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
%mask = fcmp olt <4 x float> %a, zeroinitializer
@@ -101,7 +108,8 @@ define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
; SKX: ## BB#0:
; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; SKX-NEXT: vcmpltpd %xmm2, %xmm0, %k1
-; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; SKX-NEXT: vmovapd %xmm1, %xmm0
; SKX-NEXT: retq
%mask = fcmp olt <2 x double> %a, zeroinitializer
%c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b
@@ -114,14 +122,15 @@ define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
-; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; KNL-NEXT: vmovdqa %ymm1, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
-; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; SKX-NEXT: vmovdqa %ymm1, %ymm0
; SKX-NEXT: retq
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
@@ -134,14 +143,15 @@ define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vcmpeqps %zmm1, %zmm0, %k1
-; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; KNL-NEXT: vmovaps %ymm1, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test10:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1}
+; SKX-NEXT: vmovaps %ymm1, %ymm0
; SKX-NEXT: retq
%mask = fcmp oeq <8 x float> %x, %y
@@ -658,9 +668,9 @@ define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
; CHECK-LABEL: test14:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm2
+; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm2
; CHECK-NEXT: vpcmpgtd %zmm0, %zmm2, %k1
-; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%sub_r = sub <16 x i32> %a, %b
%cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a
@@ -673,9 +683,9 @@ define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
; CHECK-LABEL: test15:
; CHECK: ## BB#0:
-; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2
+; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2
; CHECK-NEXT: vpcmpgtq %zmm0, %zmm2, %k1
-; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
%sub_r = sub <8 x i64> %a, %b
%cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a
@@ -689,7 +699,8 @@ define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind
; CHECK-LABEL: test16:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1
-; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask = icmp sge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
@@ -700,7 +711,8 @@ define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-LABEL: test17:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
-; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sgt <16 x i32> %x, %y
@@ -712,7 +724,8 @@ define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-LABEL: test18:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1
-; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sle <16 x i32> %x, %y
@@ -724,7 +737,8 @@ define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
; CHECK-LABEL: test19:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1
-; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp ule <16 x i32> %x, %y
@@ -737,7 +751,8 @@ define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i3
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask1 = icmp eq <16 x i32> %x1, %y1
%mask0 = icmp eq <16 x i32> %x, %y
@@ -751,7 +766,8 @@ define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i64> %x1, %y1
%mask0 = icmp sle <8 x i64> %x, %y
@@ -765,7 +781,8 @@ define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i6
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %zmm2, %zmm1, %k1
; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask1 = icmp sgt <8 x i64> %x1, %y1
%y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
@@ -780,7 +797,8 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i32> %x1, %y1
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
@@ -794,7 +812,8 @@ define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
; CHECK-LABEL: test24:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
-; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
@@ -808,7 +827,8 @@ define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind
; CHECK-LABEL: test25:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
-; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
@@ -823,7 +843,8 @@ define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
@@ -840,7 +861,8 @@ define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1
; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
@@ -858,8 +880,7 @@ define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1
; KNL-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1
; KNL-NEXT: kxnorw %k1, %k0, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: retq
;
@@ -883,8 +904,7 @@ define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32>
; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; KNL-NEXT: vpcmpgtd %zmm3, %zmm2, %k1
; KNL-NEXT: kxorw %k1, %k0, %k1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: retq
;
@@ -912,7 +932,8 @@ define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind {
; SKX-LABEL: test30:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1}
+; SKX-NEXT: vmovapd %ymm1, %ymm0
; SKX-NEXT: retq
%mask = fcmp oeq <4 x double> %x, %y
@@ -930,7 +951,8 @@ define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp
; SKX-LABEL: test31:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1
-; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; SKX-NEXT: vmovapd %xmm1, %xmm0
; SKX-NEXT: retq
%y = load <2 x double>, <2 x double>* %yp, align 4
@@ -949,7 +971,8 @@ define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp
; SKX-LABEL: test32:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1}
+; SKX-NEXT: vmovapd %ymm1, %ymm0
; SKX-NEXT: retq
%y = load <4 x double>, <4 x double>* %yp, align 4
@@ -962,7 +985,8 @@ define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp
; CHECK-LABEL: test33:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1
-; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
; CHECK-NEXT: retq
%y = load <8 x double>, <8 x double>* %yp, align 4
%mask = fcmp olt <8 x double> %x, %y
@@ -980,7 +1004,8 @@ define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) no
; SKX-LABEL: test34:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1
-; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
%y = load <4 x float>, <4 x float>* %yp, align 4
%mask = fcmp olt <4 x float> %x, %y
@@ -995,14 +1020,15 @@ define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) no
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vmovups (%rdi), %ymm2
; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1
-; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; KNL-NEXT: vmovaps %ymm1, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test35:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1}
+; SKX-NEXT: vmovaps %ymm1, %ymm0
; SKX-NEXT: retq
%y = load <8 x float>, <8 x float>* %yp, align 4
@@ -1015,7 +1041,8 @@ define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp
; CHECK-LABEL: test36:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1
-; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%y = load <16 x float>, <16 x float>* %yp, align 4
%mask = fcmp olt <16 x float> %x, %y
@@ -1027,7 +1054,8 @@ define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nou
; CHECK-LABEL: test37:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1
-; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
; CHECK-NEXT: retq
%a = load double, double* %ptr
@@ -1050,7 +1078,8 @@ define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, double* %ptr) nou
; SKX-LABEL: test38:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1
-; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: vmovapd %ymm0, %ymm1 {%k1}
+; SKX-NEXT: vmovapd %ymm1, %ymm0
; SKX-NEXT: retq
%a = load double, double* %ptr
@@ -1073,7 +1102,8 @@ define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nou
; SKX-LABEL: test39:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1
-; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; SKX-NEXT: vmovapd %xmm1, %xmm0
; SKX-NEXT: retq
%a = load double, double* %ptr
@@ -1090,7 +1120,8 @@ define <16 x float> @test40(<16 x float> %x, <16 x float> %x1, float* %ptr) n
; CHECK-LABEL: test40:
; CHECK: ## BB#0:
; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1
-; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%a = load float, float* %ptr
@@ -1109,14 +1140,15 @@ define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, float* %ptr) noun
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; KNL-NEXT: vbroadcastss (%rdi), %ymm2
; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1
-; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; KNL-NEXT: vmovaps %ymm1, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test41:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1
-; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT: vmovaps %ymm0, %ymm1 {%k1}
+; SKX-NEXT: vmovaps %ymm1, %ymm0
; SKX-NEXT: retq
%a = load float, float* %ptr
@@ -1139,7 +1171,8 @@ define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, float* %ptr) noun
; SKX-LABEL: test42:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1
-; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; SKX-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
; SKX-NEXT: retq
%a = load float, float* %ptr
@@ -1158,7 +1191,8 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x
; KNL-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
-; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; KNL-NEXT: vmovapd %zmm0, %zmm1 {%k1}
+; KNL-NEXT: vmovapd %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test43:
@@ -1166,7 +1200,8 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x
; SKX-NEXT: vpsllw $15, %xmm2, %xmm2
; SKX-NEXT: vpmovw2m %xmm2, %k1
; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
-; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; SKX-NEXT: vmovapd %zmm0, %zmm1 {%k1}
+; SKX-NEXT: vmovapd %zmm1, %zmm0
; SKX-NEXT: retq
%a = load double, double* %ptr
diff --git a/test/CodeGen/X86/avx512bw-mov.ll b/test/CodeGen/X86/avx512bw-mov.ll
index c58b3cc8c3cd..11bb431414a0 100644
--- a/test/CodeGen/X86/avx512bw-mov.ll
+++ b/test/CodeGen/X86/avx512bw-mov.ll
@@ -26,7 +26,7 @@ define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpcmpneqb %zmm2, %zmm1, %k1
-; CHECK-NEXT: vpblendmb (%rdi), %zmm0, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ne <64 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <64 x i8>*
@@ -74,7 +74,7 @@ define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
; CHECK-NEXT: vpcmpneqw %zmm2, %zmm1, %k1
-; CHECK-NEXT: vpblendmw (%rdi), %zmm0, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
%mask = icmp ne <32 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <32 x i16>*
diff --git a/test/CodeGen/X86/avx512bw-vec-cmp.ll b/test/CodeGen/X86/avx512bw-vec-cmp.ll
index 016837e61307..34432468921b 100644
--- a/test/CodeGen/X86/avx512bw-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512bw-vec-cmp.ll
@@ -5,7 +5,8 @@ define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind {
; CHECK-LABEL: test1:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
-; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask = icmp eq <64 x i8> %x, %y
%max = select <64 x i1> %mask, <64 x i8> %x, <64 x i8> %y
@@ -16,7 +17,8 @@ define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind {
; CHECK-LABEL: test2:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
-; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask = icmp sgt <64 x i8> %x, %y
%max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y
@@ -27,7 +29,8 @@ define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind
; CHECK-LABEL: test3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k1
-; CHECK-NEXT: vpblendmw %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask = icmp sge <32 x i16> %x, %y
%max = select <32 x i1> %mask, <32 x i16> %x1, <32 x i16> %y
@@ -38,7 +41,8 @@ define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind {
; CHECK-LABEL: test4:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
-; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask = icmp ugt <64 x i8> %x, %y
%max = select <64 x i1> %mask, <64 x i8> %x1, <64 x i8> %y
@@ -49,7 +53,8 @@ define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwin
; CHECK-LABEL: test5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k1
-; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %yp, align 4
%mask = icmp eq <32 x i16> %x, %y
@@ -61,7 +66,8 @@ define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun
; CHECK-LABEL: test6:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k1
-; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask = icmp sgt <32 x i16> %x, %y
@@ -73,7 +79,8 @@ define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun
; CHECK-LABEL: test7:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew (%rdi), %zmm0, %k1
-; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask = icmp sle <32 x i16> %x, %y
@@ -85,7 +92,8 @@ define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun
; CHECK-LABEL: test8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1
-; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
%mask = icmp ule <32 x i16> %x, %y
@@ -98,7 +106,8 @@ define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 {%k1}
-; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask1 = icmp eq <32 x i16> %x1, %y1
%mask0 = icmp eq <32 x i16> %x, %y
@@ -112,7 +121,8 @@ define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleb %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpleb %zmm2, %zmm3, %k1 {%k1}
-; CHECK-NEXT: vpblendmb %zmm0, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
; CHECK-NEXT: retq
%mask1 = icmp sge <64 x i8> %x1, %y1
%mask0 = icmp sle <64 x i8> %x, %y
@@ -126,7 +136,8 @@ define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtb %zmm2, %zmm1, %k1
; CHECK-NEXT: vpcmpgtb (%rdi), %zmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask1 = icmp sgt <64 x i8> %x1, %y1
%y = load <64 x i8>, <64 x i8>* %y.ptr, align 4
@@ -141,7 +152,8 @@ define <32 x i16> @test12(<32 x i16> %x, <32 x i16>* %y.ptr, <32 x i16> %x1, <32
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew %zmm1, %zmm2, %k1
; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%mask1 = icmp sge <32 x i16> %x1, %y1
%y = load <32 x i16>, <32 x i16>* %y.ptr, align 4
diff --git a/test/CodeGen/X86/avx512bwvl-mov.ll b/test/CodeGen/X86/avx512bwvl-mov.ll
index 209f18ba7f9c..3f92641a3e16 100644
--- a/test/CodeGen/X86/avx512bwvl-mov.ll
+++ b/test/CodeGen/X86/avx512bwvl-mov.ll
@@ -26,7 +26,7 @@ define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
; CHECK-NEXT: vpcmpneqb %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x3f,0xca,0x04]
-; CHECK-NEXT: vpblendmb (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x66,0x07]
+; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <32 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <32 x i8>*
@@ -74,7 +74,7 @@ define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
; CHECK-NEXT: vpcmpneqw %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x3f,0xca,0x04]
-; CHECK-NEXT: vpblendmw (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x66,0x07]
+; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i16>*
@@ -122,7 +122,7 @@ define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x3f,0xca,0x04]
-; CHECK-NEXT: vpblendmb (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x66,0x07]
+; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <16 x i8> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <16 x i8>*
@@ -170,7 +170,7 @@ define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqw %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x3f,0xca,0x04]
-; CHECK-NEXT: vpblendmw (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x66,0x07]
+; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i16> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i16>*
diff --git a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
index 17e581bbb501..3e7f0acae78b 100644
--- a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
@@ -5,7 +5,8 @@ define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind {
; CHECK-LABEL: test256_1:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask = icmp eq <32 x i8> %x, %y
%max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %y
@@ -16,7 +17,8 @@ define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind
; CHECK-LABEL: test256_2:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%mask = icmp sgt <32 x i8> %x, %y
%max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
@@ -27,7 +29,8 @@ define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounw
; CHECK-LABEL: test256_3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k1
-; CHECK-NEXT: vpblendmw %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask = icmp sge <16 x i16> %x, %y
%max = select <16 x i1> %mask, <16 x i16> %x1, <16 x i16> %y
@@ -38,7 +41,8 @@ define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind
; CHECK-LABEL: test256_4:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%mask = icmp ugt <32 x i8> %x, %y
%max = select <32 x i1> %mask, <32 x i8> %x, <32 x i8> %x1
@@ -49,7 +53,8 @@ define <16 x i16> @test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nou
; CHECK-LABEL: test256_5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %yp, align 4
%mask = icmp eq <16 x i16> %x, %y
@@ -61,7 +66,8 @@ define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr)
; CHECK-LABEL: test256_6:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask = icmp sgt <16 x i16> %x, %y
@@ -73,7 +79,8 @@ define <16 x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr)
; CHECK-LABEL: test256_7:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask = icmp sle <16 x i16> %x, %y
@@ -85,7 +92,8 @@ define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr)
; CHECK-LABEL: test256_8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
%mask = icmp ule <16 x i16> %x, %y
@@ -98,7 +106,8 @@ define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 {%k1}
-; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask1 = icmp eq <16 x i16> %x1, %y1
%mask0 = icmp eq <16 x i16> %x, %y
@@ -112,7 +121,8 @@ define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k1
; CHECK-NEXT: vpcmpleb %ymm2, %ymm3, %k1 {%k1}
-; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%mask1 = icmp sge <32 x i8> %x1, %y1
%mask0 = icmp sle <32 x i8> %x, %y
@@ -126,7 +136,8 @@ define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtb %ymm2, %ymm1, %k1
; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask1 = icmp sgt <32 x i8> %x1, %y1
%y = load <32 x i8>, <32 x i8>* %y.ptr, align 4
@@ -141,7 +152,8 @@ define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1,
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew %ymm1, %ymm2, %k1
; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i16> %x1, %y1
%y = load <16 x i16>, <16 x i16>* %y.ptr, align 4
@@ -155,7 +167,8 @@ define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind {
; CHECK-LABEL: test128_1:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask = icmp eq <16 x i8> %x, %y
%max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %y
@@ -166,7 +179,8 @@ define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind
; CHECK-LABEL: test128_2:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%mask = icmp sgt <16 x i8> %x, %y
%max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
@@ -177,7 +191,8 @@ define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind
; CHECK-LABEL: test128_3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k1
-; CHECK-NEXT: vpblendmw %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask = icmp sge <8 x i16> %x, %y
%max = select <8 x i1> %mask, <8 x i16> %x1, <8 x i16> %y
@@ -188,7 +203,8 @@ define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind
; CHECK-LABEL: test128_4:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%mask = icmp ugt <16 x i8> %x, %y
%max = select <16 x i1> %mask, <16 x i8> %x, <16 x i8> %x1
@@ -199,7 +215,8 @@ define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwin
; CHECK-LABEL: test128_5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %yp, align 4
%mask = icmp eq <8 x i16> %x, %y
@@ -211,7 +228,8 @@ define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun
; CHECK-LABEL: test128_6:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask = icmp sgt <8 x i16> %x, %y
@@ -223,7 +241,8 @@ define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun
; CHECK-LABEL: test128_7:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask = icmp sle <8 x i16> %x, %y
@@ -235,7 +254,8 @@ define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun
; CHECK-LABEL: test128_8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
%mask = icmp ule <8 x i16> %x, %y
@@ -248,7 +268,8 @@ define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16>
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 {%k1}
-; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask1 = icmp eq <8 x i16> %x1, %y1
%mask0 = icmp eq <8 x i16> %x, %y
@@ -262,7 +283,8 @@ define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k1
; CHECK-NEXT: vpcmpleb %xmm2, %xmm3, %k1 {%k1}
-; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%mask1 = icmp sge <16 x i8> %x1, %y1
%mask0 = icmp sle <16 x i8> %x, %y
@@ -276,7 +298,8 @@ define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtb %xmm2, %xmm1, %k1
; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask1 = icmp sgt <16 x i8> %x1, %y1
%y = load <16 x i8>, <16 x i8>* %y.ptr, align 4
@@ -291,7 +314,8 @@ define <8 x i16> @test128_12(<8 x i16> %x, <8 x i16>* %y.ptr, <8 x i16> %x1, <8
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmplew %xmm1, %xmm2, %k1
; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i16> %x1, %y1
%y = load <8 x i16>, <8 x i16>* %y.ptr, align 4
diff --git a/test/CodeGen/X86/avx512vl-mov.ll b/test/CodeGen/X86/avx512vl-mov.ll
index e37fd76377e3..af449d6628c4 100644
--- a/test/CodeGen/X86/avx512vl-mov.ll
+++ b/test/CodeGen/X86/avx512vl-mov.ll
@@ -166,7 +166,7 @@ define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
-; CHECK-NEXT: vpblendmd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x64,0x07]
+; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i32>*
@@ -180,7 +180,7 @@ define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
-; CHECK-NEXT: vpblendmd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x64,0x07]
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <8 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x i32>*
@@ -222,7 +222,7 @@ define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
-; CHECK-NEXT: vpblendmq (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x64,0x07]
+; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i64>*
@@ -236,7 +236,7 @@ define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
-; CHECK-NEXT: vpblendmq (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x64,0x07]
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i64>*
@@ -279,7 +279,7 @@ define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
; CHECK-NEXT: vcmpordps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x07]
; CHECK-NEXT: vcmpneqps %ymm2, %ymm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0xc2,0xca,0x04]
-; CHECK-NEXT: vblendmps (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x65,0x07]
+; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x float>*
@@ -294,7 +294,7 @@ define <8 x float> @test_256_26(i8 * %addr, <8 x float> %old, <8 x float> %mask1
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
; CHECK-NEXT: vcmpordps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x07]
; CHECK-NEXT: vcmpneqps %ymm2, %ymm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0xc2,0xca,0x04]
-; CHECK-NEXT: vblendmps (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x65,0x07]
+; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <8 x float>*
@@ -338,7 +338,7 @@ define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
-; CHECK-NEXT: vblendmpd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x65,0x07]
+; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x double>*
@@ -352,7 +352,7 @@ define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
-; CHECK-NEXT: vblendmpd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x65,0x07]
+; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x double>*
@@ -554,7 +554,7 @@ define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
-; CHECK-NEXT: vpblendmd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x64,0x07]
+; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i32>*
@@ -568,7 +568,7 @@ define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
-; CHECK-NEXT: vpblendmd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x64,0x07]
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x i32>*
@@ -610,7 +610,7 @@ define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
-; CHECK-NEXT: vpblendmq (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x64,0x07]
+; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x i64>*
@@ -624,7 +624,7 @@ define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
-; CHECK-NEXT: vpblendmq (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x64,0x07]
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x i64>*
@@ -666,7 +666,7 @@ define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1)
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
-; CHECK-NEXT: vblendmps (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x65,0x07]
+; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x float>*
@@ -680,7 +680,7 @@ define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1)
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
-; CHECK-NEXT: vblendmps (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x65,0x07]
+; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <4 x i32> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <4 x float>*
@@ -722,7 +722,7 @@ define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
-; CHECK-NEXT: vblendmpd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x65,0x07]
+; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x double>*
@@ -736,7 +736,7 @@ define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1
; CHECK: ## BB#0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
-; CHECK-NEXT: vblendmpd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x65,0x07]
+; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = icmp ne <2 x i64> %mask1, zeroinitializer
%vaddr = bitcast i8* %addr to <2 x double>*
diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll
index e0acf2be653e..25b9cc79096f 100644
--- a/test/CodeGen/X86/avx512vl-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll
@@ -5,7 +5,8 @@ define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind {
; CHECK-LABEL: test256_1:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask = icmp eq <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> %y
@@ -16,7 +17,8 @@ define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind
; CHECK-LABEL: test256_2:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask = icmp sgt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
@@ -27,7 +29,8 @@ define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind
; CHECK-LABEL: test256_3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k1
-; CHECK-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm2, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask = icmp sge <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y
@@ -38,7 +41,8 @@ define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind
; CHECK-LABEL: test256_4:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1
-; CHECK-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %ymm2, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask = icmp ugt <4 x i64> %x, %y
%max = select <4 x i1> %mask, <4 x i64> %x1, <4 x i64> %y
@@ -49,7 +53,8 @@ define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwin
; CHECK-LABEL: test256_5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %x, %y
@@ -61,7 +66,8 @@ define <8 x i32> @test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
; CHECK-LABEL: test256_5b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %y, %x
@@ -73,7 +79,8 @@ define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
; CHECK-LABEL: test256_6:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sgt <8 x i32> %x, %y
@@ -85,7 +92,8 @@ define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
; CHECK-LABEL: test256_6b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp slt <8 x i32> %y, %x
@@ -97,7 +105,8 @@ define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
; CHECK-LABEL: test256_7:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sle <8 x i32> %x, %y
@@ -109,7 +118,8 @@ define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
; CHECK-LABEL: test256_7b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sge <8 x i32> %y, %x
@@ -121,7 +131,8 @@ define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
; CHECK-LABEL: test256_8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp ule <8 x i32> %x, %y
@@ -133,7 +144,8 @@ define <8 x i32> @test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
; CHECK-LABEL: test256_8b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp uge <8 x i32> %y, %x
@@ -146,7 +158,8 @@ define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32>
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask1 = icmp eq <8 x i32> %x1, %y1
%mask0 = icmp eq <8 x i32> %x, %y
@@ -160,7 +173,8 @@ define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k1
; CHECK-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%mask0 = icmp sle <4 x i64> %x, %y
@@ -174,7 +188,8 @@ define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %ymm2, %ymm1, %k1
; CHECK-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask1 = icmp sgt <4 x i64> %x1, %y1
%y = load <4 x i64>, <4 x i64>* %y.ptr, align 4
@@ -189,7 +204,8 @@ define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
@@ -203,7 +219,8 @@ define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind
; CHECK-LABEL: test256_13:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <4 x i64> undef, i64 %yb, i32 0
@@ -217,7 +234,8 @@ define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind
; CHECK-LABEL: test256_14:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
@@ -232,7 +250,8 @@ define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %ymm1, %ymm2, %k1
; CHECK-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
@@ -249,7 +268,8 @@ define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %ymm1, %ymm2, %k1
; CHECK-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
@@ -265,7 +285,8 @@ define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
; CHECK-LABEL: test256_17:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp ne <8 x i32> %x, %y
@@ -277,7 +298,8 @@ define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
; CHECK-LABEL: test256_18:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp ne <8 x i32> %y, %x
@@ -289,7 +311,8 @@ define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
; CHECK-LABEL: test256_19:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnltud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp uge <8 x i32> %x, %y
@@ -301,7 +324,8 @@ define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
; CHECK-LABEL: test256_20:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %ymm0, %k1
-; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp uge <8 x i32> %y, %x
@@ -313,7 +337,8 @@ define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind {
; CHECK-LABEL: test128_1:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask = icmp eq <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> %y
@@ -324,7 +349,8 @@ define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind
; CHECK-LABEL: test128_2:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask = icmp sgt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
@@ -335,7 +361,8 @@ define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind
; CHECK-LABEL: test128_3:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k1
-; CHECK-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask = icmp sge <4 x i32> %x, %y
%max = select <4 x i1> %mask, <4 x i32> %x1, <4 x i32> %y
@@ -346,7 +373,8 @@ define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind
; CHECK-LABEL: test128_4:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
-; CHECK-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask = icmp ugt <2 x i64> %x, %y
%max = select <2 x i1> %mask, <2 x i64> %x1, <2 x i64> %y
@@ -357,7 +385,8 @@ define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwin
; CHECK-LABEL: test128_5:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %yp, align 4
%mask = icmp eq <4 x i32> %x, %y
@@ -369,7 +398,8 @@ define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwi
; CHECK-LABEL: test128_5b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %yp, align 4
%mask = icmp eq <4 x i32> %y, %x
@@ -381,7 +411,8 @@ define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
; CHECK-LABEL: test128_6:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sgt <4 x i32> %x, %y
@@ -393,7 +424,8 @@ define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_6b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp slt <4 x i32> %y, %x
@@ -405,7 +437,8 @@ define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
; CHECK-LABEL: test128_7:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sle <4 x i32> %x, %y
@@ -417,7 +450,8 @@ define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_7b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp sge <4 x i32> %y, %x
@@ -429,7 +463,8 @@ define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
; CHECK-LABEL: test128_8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ule <4 x i32> %x, %y
@@ -441,7 +476,8 @@ define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_8b:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %y, %x
@@ -454,7 +490,8 @@ define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32>
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask1 = icmp eq <4 x i32> %x1, %y1
%mask0 = icmp eq <4 x i32> %x, %y
@@ -468,7 +505,8 @@ define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k1
; CHECK-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%mask0 = icmp sle <2 x i64> %x, %y
@@ -482,7 +520,8 @@ define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtq %xmm2, %xmm1, %k1
; CHECK-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask1 = icmp sgt <2 x i64> %x1, %y1
%y = load <2 x i64>, <2 x i64>* %y.ptr, align 4
@@ -497,7 +536,8 @@ define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
@@ -511,7 +551,8 @@ define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind
; CHECK-LABEL: test128_13:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <2 x i64> undef, i64 %yb, i32 0
@@ -525,7 +566,8 @@ define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind
; CHECK-LABEL: test128_14:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <4 x i32> undef, i32 %yb, i32 0
@@ -540,7 +582,8 @@ define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpled %xmm1, %xmm2, %k1
; CHECK-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask1 = icmp sge <4 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
@@ -557,7 +600,8 @@ define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleq %xmm1, %xmm2, %k1
; CHECK-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1}
-; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%mask1 = icmp sge <2 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
@@ -573,7 +617,8 @@ define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_17:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ne <4 x i32> %x, %y
@@ -585,7 +630,8 @@ define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_18:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp ne <4 x i32> %y, %x
@@ -597,7 +643,8 @@ define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_19:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpnltud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %x, %y
@@ -609,7 +656,8 @@ define <4 x i32> @test128_20(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
; CHECK-LABEL: test128_20:
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpleud (%rdi), %xmm0, %k1
-; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%y = load <4 x i32>, <4 x i32>* %y.ptr, align 4
%mask = icmp uge <4 x i32> %y, %x
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index 8e9bc8b5af4b..0060539c691f 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -157,16 +157,12 @@ define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind {
ret i8 %d
}
-; FIXME: The 'not' is redundant.
-
define i32 @smin(i32 %x) {
; CHECK-LABEL: smin:
; CHECK: ## BB#0:
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: notl %ecx
; CHECK-NEXT: xorl $-1, %edi
; CHECK-NEXT: movl $-1, %eax
-; CHECK-NEXT: cmovsl %ecx, %eax
+; CHECK-NEXT: cmovsl %edi, %eax
; CHECK-NEXT: retq
%not_x = xor i32 %x, -1
%1 = icmp slt i32 %not_x, -1
diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll
index 5636a5bcd73e..5329f5b216a4 100644
--- a/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/test/CodeGen/X86/fma-fneg-combine.ll
@@ -222,9 +222,9 @@ define <16 x float> @test15(<16 x float> %a, <16 x float> %b, <16 x float> %c, i
; SKX-NEXT: kmovw %edi, %k1
; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm3
; SKX-NEXT: vfnmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1
-; SKX-NEXT: vblendmps %zmm1, %zmm3, %zmm1 {%k1}
-; SKX-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovaps %zmm1, %zmm3 {%k1}
+; SKX-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm3 {%k1}
+; SKX-NEXT: vmovaps %zmm3, %zmm0
; SKX-NEXT: retq
;
; KNL-LABEL: test15:
@@ -232,9 +232,9 @@ define <16 x float> @test15(<16 x float> %a, <16 x float> %b, <16 x float> %c, i
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm3
; KNL-NEXT: vfnmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1
-; KNL-NEXT: vblendmps %zmm1, %zmm3, %zmm1 {%k1}
-; KNL-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm1 {%k1}
-; KNL-NEXT: vmovaps %zmm1, %zmm0
+; KNL-NEXT: vmovaps %zmm1, %zmm3 {%k1}
+; KNL-NEXT: vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm3 {%k1}
+; KNL-NEXT: vmovaps %zmm3, %zmm0
; KNL-NEXT: retq
entry:
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
diff --git a/test/CodeGen/X86/fmaddsub-combine.ll b/test/CodeGen/X86/fmaddsub-combine.ll
new file mode 100644
index 000000000000..f3b13cd053b4
--- /dev/null
+++ b/test/CodeGen/X86/fmaddsub-combine.ll
@@ -0,0 +1,129 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_256 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_512 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck -check-prefix=FMA4 %s
+
+; This test checks the fusing of MUL + ADDSUB to FMADDSUB.
+
+define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
+; FMA3-LABEL: mul_addsub_pd128:
+; FMA3: # BB#0: # %entry
+; FMA3-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: mul_addsub_pd128:
+; FMA4: # BB#0: # %entry
+; FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %AB = fmul <2 x double> %A, %B
+ %Sub = fsub <2 x double> %AB, %C
+ %Add = fadd <2 x double> %AB, %C
+ %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %Addsub
+}
+
+define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
+; FMA3-LABEL: mul_addsub_ps128:
+; FMA3: # BB#0: # %entry
+; FMA3-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: mul_addsub_ps128:
+; FMA4: # BB#0: # %entry
+; FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+entry:
+ %AB = fmul <4 x float> %A, %B
+ %Sub = fsub <4 x float> %AB, %C
+ %Add = fadd <4 x float> %AB, %C
+ %Addsub = shufflevector <4 x float> %Sub, <4 x float> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x float> %Addsub
+}
+
+define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
+; FMA3-LABEL: mul_addsub_pd256:
+; FMA3: # BB#0: # %entry
+; FMA3-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: mul_addsub_pd256:
+; FMA4: # BB#0: # %entry
+; FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; FMA4-NEXT: retq
+entry:
+ %AB = fmul <4 x double> %A, %B
+ %Sub = fsub <4 x double> %AB, %C
+ %Add = fadd <4 x double> %AB, %C
+ %Addsub = shufflevector <4 x double> %Sub, <4 x double> %Add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+ ret <4 x double> %Addsub
+}
+
+define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
+; FMA3-LABEL: mul_addsub_ps256:
+; FMA3: # BB#0: # %entry
+; FMA3-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: mul_addsub_ps256:
+; FMA4: # BB#0: # %entry
+; FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
+; FMA4-NEXT: retq
+entry:
+ %AB = fmul <8 x float> %A, %B
+ %Sub = fsub <8 x float> %AB, %C
+ %Add = fadd <8 x float> %AB, %C
+ %Addsub = shufflevector <8 x float> %Sub, <8 x float> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x float> %Addsub
+}
+
+define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
+; FMA3_256-LABEL: mul_addsub_pd512:
+; FMA3_256: # BB#0: # %entry
+; FMA3_256-NEXT: vfmaddsub213pd %ymm4, %ymm2, %ymm0
+; FMA3_256-NEXT: vfmaddsub213pd %ymm5, %ymm3, %ymm1
+; FMA3_256-NEXT: retq
+;
+; FMA3_512-LABEL: mul_addsub_pd512:
+; FMA3_512: # BB#0: # %entry
+; FMA3_512-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm0
+; FMA3_512-NEXT: retq
+;
+; FMA4-LABEL: mul_addsub_pd512:
+; FMA4: # BB#0: # %entry
+; FMA4-NEXT: vfmaddsubpd %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vfmaddsubpd %ymm5, %ymm3, %ymm1, %ymm1
+; FMA4-NEXT: retq
+entry:
+ %AB = fmul <8 x double> %A, %B
+ %Sub = fsub <8 x double> %AB, %C
+ %Add = fadd <8 x double> %AB, %C
+ %Addsub = shufflevector <8 x double> %Sub, <8 x double> %Add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+ ret <8 x double> %Addsub
+}
+
+define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
+; FMA3_256-LABEL: mul_addsub_ps512:
+; FMA3_256: # BB#0: # %entry
+; FMA3_256-NEXT: vfmaddsub213ps %ymm4, %ymm2, %ymm0
+; FMA3_256-NEXT: vfmaddsub213ps %ymm5, %ymm3, %ymm1
+; FMA3_256-NEXT: retq
+;
+; FMA3_512-LABEL: mul_addsub_ps512:
+; FMA3_512: # BB#0: # %entry
+; FMA3_512-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm0
+; FMA3_512-NEXT: retq
+;
+; FMA4-LABEL: mul_addsub_ps512:
+; FMA4: # BB#0: # %entry
+; FMA4-NEXT: vfmaddsubps %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vfmaddsubps %ymm5, %ymm3, %ymm1, %ymm1
+; FMA4-NEXT: retq
+entry:
+ %AB = fmul <16 x float> %A, %B
+ %Sub = fsub <16 x float> %AB, %C
+ %Add = fadd <16 x float> %AB, %C
+ %Addsub = shufflevector <16 x float> %Sub, <16 x float> %Add, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+ ret <16 x float> %Addsub
+}
+
+attributes #0 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/X86/sse-fsignum.ll b/test/CodeGen/X86/sse-fsignum.ll
index 7159d4c87174..32594a27698d 100644
--- a/test/CodeGen/X86/sse-fsignum.ll
+++ b/test/CodeGen/X86/sse-fsignum.ll
@@ -93,15 +93,14 @@ define void @signum32b(<8 x float>*) {
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} {z}
-; AVX512F-NEXT: vpmovqd %zmm3, %ymm3
-; AVX512F-NEXT: vcvtdq2ps %ymm3, %ymm3
+; AVX512F-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpmovqd %zmm2, %ymm2
+; AVX512F-NEXT: vcvtdq2ps %ymm2, %ymm2
; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512F-NEXT: vsubps %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vmovaps %ymm0, (%rdi)
; AVX512F-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll
index abe3da752874..c34f333ef785 100644
--- a/test/CodeGen/X86/vector-compare-results.ll
+++ b/test/CodeGen/X86/vector-compare-results.ll
@@ -4,6 +4,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
;
; 128-bit vector comparisons
@@ -308,12 +310,26 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cmp_v16i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v16i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v16i16:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: test_cmp_v16i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: retq
%1 = icmp sgt <16 x i16> %a0, %a1
ret <16 x i1> %1
}
@@ -589,13 +605,26 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cmp_v8f64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcmpltpd %zmm0, %zmm1, %k1
-; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v8f64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v8f64:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
+; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
+; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: test_cmp_v8f64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1
+; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: retq
%1 = fcmp ogt <8 x double> %a0, %a1
ret <8 x i1> %1
}
@@ -636,13 +665,26 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cmp_v16f32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcmpltps %zmm0, %zmm1, %k1
-; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v16f32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v16f32:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: test_cmp_v16f32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k1
+; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: retq
%1 = fcmp ogt <16 x float> %a0, %a1
ret <16 x i1> %1
}
@@ -734,13 +776,26 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cmp_v8i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
-; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v8i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v8i64:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
+; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: test_cmp_v8i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
+; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: retq
%1 = icmp sgt <8 x i64> %a0, %a1
ret <8 x i1> %1
}
@@ -784,13 +839,26 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cmp_v16i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
-; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v16i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v16i32:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: test_cmp_v16i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: retq
%1 = icmp sgt <16 x i32> %a0, %a1
ret <16 x i1> %1
}
@@ -1045,16 +1113,35 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cmp_v32i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
-; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v32i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v32i16:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: test_cmp_v32i16:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
+; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%1 = icmp sgt <32 x i16> %a0, %a1
ret <32 x i1> %1
}
@@ -1874,15 +1961,31 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cmp_v64i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
-; AVX512-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm3
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512-NEXT: vmovdqa %xmm4, %xmm2
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v64i8:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm3
+; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vmovdqa %xmm4, %xmm2
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v64i8:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
+; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm3
+; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm2
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: test_cmp_v64i8:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: retq
%1 = icmp sgt <64 x i8> %a0, %a1
ret <64 x i1> %1
}
@@ -1957,120 +2060,350 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cmp_v16f64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm4
-; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm5
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: vucomisd %xmm4, %xmm5
-; AVX512-NEXT: movq $-1, %rcx
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm6
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512-NEXT: vucomisd %xmm4, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm4
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm5
-; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm6
-; AVX512-NEXT: vucomisd %xmm5, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm7
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512-NEXT: vucomisd %xmm5, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm5
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512-NEXT: vextractf32x4 $1, %zmm2, %xmm5
-; AVX512-NEXT: vextractf32x4 $1, %zmm0, %xmm6
-; AVX512-NEXT: vucomisd %xmm5, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm7
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512-NEXT: vucomisd %xmm5, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm5
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512-NEXT: vucomisd %xmm2, %xmm0
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm6
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512-NEXT: vucomisd %xmm2, %xmm0
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vextractf32x4 $3, %zmm3, %xmm2
-; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm4
-; AVX512-NEXT: vucomisd %xmm2, %xmm4
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm5
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512-NEXT: vucomisd %xmm2, %xmm4
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm2
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX512-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm5
-; AVX512-NEXT: vucomisd %xmm4, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm6
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512-NEXT: vucomisd %xmm4, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm4
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512-NEXT: vextractf32x4 $1, %zmm3, %xmm4
-; AVX512-NEXT: vextractf32x4 $1, %zmm1, %xmm5
-; AVX512-NEXT: vucomisd %xmm4, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm6
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512-NEXT: vucomisd %xmm4, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm4
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512-NEXT: vucomisd %xmm3, %xmm1
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovaq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm5
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512-NEXT: vucomisd %xmm3, %xmm1
-; AVX512-NEXT: cmovaq %rcx, %rax
-; AVX512-NEXT: vmovq %rax, %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v16f64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextractf32x4 $3, %zmm2, %xmm4
+; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm5
+; AVX512F-NEXT: xorl %eax, %eax
+; AVX512F-NEXT: vucomisd %xmm4, %xmm5
+; AVX512F-NEXT: movq $-1, %rcx
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm6
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512F-NEXT: vucomisd %xmm4, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512F-NEXT: vextractf32x4 $2, %zmm2, %xmm5
+; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm6
+; AVX512F-NEXT: vucomisd %xmm5, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm7
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
+; AVX512F-NEXT: vucomisd %xmm5, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm5
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT: vextractf32x4 $1, %zmm2, %xmm5
+; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm6
+; AVX512F-NEXT: vucomisd %xmm5, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm7
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
+; AVX512F-NEXT: vucomisd %xmm5, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm5
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
+; AVX512F-NEXT: vucomisd %xmm2, %xmm0
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm6
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vucomisd %xmm2, %xmm0
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vextractf32x4 $3, %zmm3, %xmm2
+; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm4
+; AVX512F-NEXT: vucomisd %xmm2, %xmm4
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm5
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512F-NEXT: vucomisd %xmm2, %xmm4
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
+; AVX512F-NEXT: vextractf32x4 $2, %zmm3, %xmm4
+; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm5
+; AVX512F-NEXT: vucomisd %xmm4, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm6
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512F-NEXT: vucomisd %xmm4, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vextractf32x4 $1, %zmm3, %xmm4
+; AVX512F-NEXT: vextractf32x4 $1, %zmm1, %xmm5
+; AVX512F-NEXT: vucomisd %xmm4, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm6
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512F-NEXT: vucomisd %xmm4, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512F-NEXT: vucomisd %xmm3, %xmm1
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovaq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm5
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512F-NEXT: vucomisd %xmm3, %xmm1
+; AVX512F-NEXT: cmovaq %rcx, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v16f64:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vextractf64x2 $3, %zmm2, %xmm4
+; AVX512DQ-NEXT: vextractf64x2 $3, %zmm0, %xmm5
+; AVX512DQ-NEXT: xorl %eax, %eax
+; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
+; AVX512DQ-NEXT: movq $-1, %rcx
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm6
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm4
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512DQ-NEXT: vextractf64x2 $2, %zmm2, %xmm5
+; AVX512DQ-NEXT: vextractf64x2 $2, %zmm0, %xmm6
+; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm7
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
+; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm5
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vextractf64x2 $1, %zmm2, %xmm5
+; AVX512DQ-NEXT: vextractf64x2 $1, %zmm0, %xmm6
+; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm7
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
+; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm5
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
+; AVX512DQ-NEXT: vucomisd %xmm2, %xmm0
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm6
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512DQ-NEXT: vucomisd %xmm2, %xmm0
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm0
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT: vextractf64x2 $3, %zmm3, %xmm2
+; AVX512DQ-NEXT: vextractf64x2 $3, %zmm1, %xmm4
+; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm5
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm2
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
+; AVX512DQ-NEXT: vextractf64x2 $2, %zmm3, %xmm4
+; AVX512DQ-NEXT: vextractf64x2 $2, %zmm1, %xmm5
+; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm6
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm4
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512DQ-NEXT: vextractf64x2 $1, %zmm3, %xmm4
+; AVX512DQ-NEXT: vextractf64x2 $1, %zmm1, %xmm5
+; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm6
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm4
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512DQ-NEXT: vucomisd %xmm3, %xmm1
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovaq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm5
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512DQ-NEXT: vucomisd %xmm3, %xmm1
+; AVX512DQ-NEXT: cmovaq %rcx, %rax
+; AVX512DQ-NEXT: vmovq %rax, %xmm1
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: test_cmp_v16f64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm2, %xmm4
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm5
+; AVX512BW-NEXT: xorl %eax, %eax
+; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
+; AVX512BW-NEXT: movq $-1, %rcx
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm6
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm4
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm2, %xmm5
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm6
+; AVX512BW-NEXT: vucomisd %xmm5, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm7
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
+; AVX512BW-NEXT: vucomisd %xmm5, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm5
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512BW-NEXT: vextractf32x4 $1, %zmm2, %xmm5
+; AVX512BW-NEXT: vextractf32x4 $1, %zmm0, %xmm6
+; AVX512BW-NEXT: vucomisd %xmm5, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm7
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
+; AVX512BW-NEXT: vucomisd %xmm5, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm5
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
+; AVX512BW-NEXT: vucomisd %xmm2, %xmm0
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm6
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512BW-NEXT: vucomisd %xmm2, %xmm0
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
+; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm3, %xmm2
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm1, %xmm4
+; AVX512BW-NEXT: vucomisd %xmm2, %xmm4
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm5
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512BW-NEXT: vucomisd %xmm2, %xmm4
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm2
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm3, %xmm4
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm1, %xmm5
+; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm6
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm4
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512BW-NEXT: vextractf32x4 $1, %zmm3, %xmm4
+; AVX512BW-NEXT: vextractf32x4 $1, %zmm1, %xmm5
+; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm6
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm4
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512BW-NEXT: vucomisd %xmm3, %xmm1
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovaq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm5
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512BW-NEXT: vucomisd %xmm3, %xmm1
+; AVX512BW-NEXT: cmovaq %rcx, %rax
+; AVX512BW-NEXT: vmovq %rax, %xmm1
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: retq
%1 = fcmp ogt <16 x double> %a0, %a1
ret <16 x i1> %1
}
@@ -2416,207 +2749,612 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cmp_v32f32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vextractf32x4 $3, %zmm2, %xmm4
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm6
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: vucomiss %xmm5, %xmm7
-; AVX512-NEXT: movl $-1, %ecx
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm4, %xmm6
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm5
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
-; AVX512-NEXT: vucomiss %xmm7, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5
-; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm4, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8
-; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm5
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm7
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; AVX512-NEXT: vucomiss %xmm6, %xmm4
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm5, %xmm7
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm4
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
-; AVX512-NEXT: vucomiss %xmm6, %xmm4
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
-; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm5, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8
-; AVX512-NEXT: vextractf32x4 $1, %zmm2, %xmm5
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512-NEXT: vextractf32x4 $1, %zmm0, %xmm7
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; AVX512-NEXT: vucomiss %xmm6, %xmm4
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm5, %xmm7
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm4
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
-; AVX512-NEXT: vucomiss %xmm6, %xmm4
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
-; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm5, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; AVX512-NEXT: vucomiss %xmm5, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm2, %xmm0
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm5
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
-; AVX512-NEXT: vucomiss %xmm6, %xmm7
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm2, %xmm0
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm8
-; AVX512-NEXT: vextractf32x4 $3, %zmm3, %xmm2
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm5
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512-NEXT: vucomiss %xmm4, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm2, %xmm5
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm4
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
-; AVX512-NEXT: vucomiss %xmm6, %xmm7
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm2, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
-; AVX512-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm6
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; AVX512-NEXT: vucomiss %xmm5, %xmm7
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm4, %xmm6
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm5
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
-; AVX512-NEXT: vucomiss %xmm7, %xmm0
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0
-; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm4, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vextractf32x4 $1, %zmm3, %xmm0
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512-NEXT: vextractf32x4 $1, %zmm1, %xmm5
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512-NEXT: vucomiss %xmm4, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm0, %xmm5
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm4
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
-; AVX512-NEXT: vucomiss %xmm6, %xmm7
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm0, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; AVX512-NEXT: vucomiss %xmm4, %xmm5
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vucomiss %xmm3, %xmm1
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmoval %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm4
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
-; AVX512-NEXT: vucomiss %xmm5, %xmm6
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmoval %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-NEXT: vucomiss %xmm3, %xmm1
-; AVX512-NEXT: cmoval %ecx, %eax
-; AVX512-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v32f32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextractf32x4 $3, %zmm2, %xmm4
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm6
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512F-NEXT: xorl %eax, %eax
+; AVX512F-NEXT: vucomiss %xmm5, %xmm7
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm4, %xmm6
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm5
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
+; AVX512F-NEXT: vucomiss %xmm7, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm4, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8
+; AVX512F-NEXT: vextractf32x4 $2, %zmm2, %xmm5
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm7
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm4
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm5, %xmm7
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm4
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm4
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm5, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8
+; AVX512F-NEXT: vextractf32x4 $1, %zmm2, %xmm5
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm7
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm4
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm5, %xmm7
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm4
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm4
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm5, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm5, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm2, %xmm0
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm5
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm7
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm2, %xmm0
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm8
+; AVX512F-NEXT: vextractf32x4 $3, %zmm3, %xmm2
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm5
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm4, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm2, %xmm5
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm4
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm7
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm2, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
+; AVX512F-NEXT: vextractf32x4 $2, %zmm3, %xmm4
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm6
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm5, %xmm7
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm4, %xmm6
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm5
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
+; AVX512F-NEXT: vucomiss %xmm7, %xmm0
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm4, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-NEXT: vextractf32x4 $1, %zmm3, %xmm0
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512F-NEXT: vextractf32x4 $1, %zmm1, %xmm5
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm4, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm0, %xmm5
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm4
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512F-NEXT: vucomiss %xmm6, %xmm7
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm0, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX512F-NEXT: vucomiss %xmm4, %xmm5
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vucomiss %xmm3, %xmm1
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmoval %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm4
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
+; AVX512F-NEXT: vucomiss %xmm5, %xmm6
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmoval %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512F-NEXT: vucomiss %xmm3, %xmm1
+; AVX512F-NEXT: cmoval %ecx, %eax
+; AVX512F-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v32f32:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vextractf32x4 $3, %zmm2, %xmm4
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512DQ-NEXT: vextractf32x4 $3, %zmm0, %xmm6
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512DQ-NEXT: xorl %eax, %eax
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7
+; AVX512DQ-NEXT: movl $-1, %ecx
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm5
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm7, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8
+; AVX512DQ-NEXT: vextractf32x4 $2, %zmm2, %xmm5
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, %xmm7
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm4
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8
+; AVX512DQ-NEXT: vextractf32x4 $1, %zmm2, %xmm5
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512DQ-NEXT: vextractf32x4 $1, %zmm0, %xmm7
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm4
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm2, %xmm0
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm5
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm7
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm2, %xmm0
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm8
+; AVX512DQ-NEXT: vextractf32x4 $3, %zmm3, %xmm2
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; AVX512DQ-NEXT: vextractf32x4 $3, %zmm1, %xmm5
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm2, %xmm5
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm4
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm7
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm2, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
+; AVX512DQ-NEXT: vextractf32x4 $2, %zmm3, %xmm4
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512DQ-NEXT: vextractf32x4 $2, %zmm1, %xmm6
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm5
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm7, %xmm0
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vextractf32x4 $1, %zmm3, %xmm0
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512DQ-NEXT: vextractf32x4 $1, %zmm1, %xmm5
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm0, %xmm5
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm4
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm6, %xmm7
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm0, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmoval %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm4
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
+; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
+; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmoval %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1
+; AVX512DQ-NEXT: cmoval %ecx, %eax
+; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: test_cmp_v32f32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm2, %xmm4
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm6
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512BW-NEXT: xorl %eax, %eax
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm7
+; AVX512BW-NEXT: movl $-1, %ecx
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm5
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
+; AVX512BW-NEXT: vucomiss %xmm7, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm2, %xmm5
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm7
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm4
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm7
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm4
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm4
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8
+; AVX512BW-NEXT: vextractf32x4 $1, %zmm2, %xmm5
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512BW-NEXT: vextractf32x4 $1, %zmm0, %xmm7
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm4
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm7
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm4
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm4
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm2, %xmm0
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm5
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm7
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm2, %xmm0
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm8
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm3, %xmm2
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; AVX512BW-NEXT: vextractf32x4 $3, %zmm1, %xmm5
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm2, %xmm5
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm4
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm7
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm2, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm3, %xmm4
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
+; AVX512BW-NEXT: vextractf32x4 $2, %zmm1, %xmm6
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm7
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm5
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
+; AVX512BW-NEXT: vucomiss %xmm7, %xmm0
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-NEXT: vextractf32x4 $1, %zmm3, %xmm0
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512BW-NEXT: vextractf32x4 $1, %zmm1, %xmm5
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm0, %xmm5
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm4
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512BW-NEXT: vucomiss %xmm6, %xmm7
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm0, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX512BW-NEXT: vucomiss %xmm4, %xmm5
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vucomiss %xmm3, %xmm1
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmoval %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm4
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
+; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
+; AVX512BW-NEXT: vucomiss %xmm5, %xmm6
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmoval %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BW-NEXT: vucomiss %xmm3, %xmm1
+; AVX512BW-NEXT: cmoval %ecx, %eax
+; AVX512BW-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%1 = fcmp ogt <32 x float> %a0, %a1
ret <32 x i1> %1
}
@@ -2785,136 +3523,398 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cmp_v16i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rcx
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm5
-; AVX512-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: cmpq %rcx, %rdx
-; AVX512-NEXT: movq $-1, %rcx
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm6
-; AVX512-NEXT: vmovq %xmm4, %rdx
-; AVX512-NEXT: vmovq %xmm5, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm4
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm5
-; AVX512-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm6
-; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm7
-; AVX512-NEXT: vmovq %xmm5, %rdx
-; AVX512-NEXT: vmovq %xmm6, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm5
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512-NEXT: vextracti32x4 $1, %zmm2, %xmm5
-; AVX512-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm7
-; AVX512-NEXT: vmovq %xmm5, %rdx
-; AVX512-NEXT: vmovq %xmm6, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm5
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm6
-; AVX512-NEXT: vmovq %xmm2, %rdx
-; AVX512-NEXT: vmovq %xmm0, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2
-; AVX512-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm5
-; AVX512-NEXT: vmovq %xmm2, %rdx
-; AVX512-NEXT: vmovq %xmm4, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm2
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; AVX512-NEXT: vpextrq $1, %xmm5, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm6
-; AVX512-NEXT: vmovq %xmm4, %rdx
-; AVX512-NEXT: vmovq %xmm5, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm4
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512-NEXT: vextracti32x4 $1, %zmm3, %xmm4
-; AVX512-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512-NEXT: vextracti32x4 $1, %zmm1, %xmm5
-; AVX512-NEXT: vpextrq $1, %xmm5, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm6
-; AVX512-NEXT: vmovq %xmm4, %rdx
-; AVX512-NEXT: vmovq %xmm5, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm4
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512-NEXT: vpextrq $1, %xmm3, %rdx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgq %rcx, %rdx
-; AVX512-NEXT: vmovq %rdx, %xmm5
-; AVX512-NEXT: vmovq %xmm3, %rdx
-; AVX512-NEXT: vmovq %xmm1, %rsi
-; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: cmovgq %rcx, %rax
-; AVX512-NEXT: vmovq %rax, %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v16i64:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm4
+; AVX512F-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm5
+; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512F-NEXT: xorl %eax, %eax
+; AVX512F-NEXT: cmpq %rcx, %rdx
+; AVX512F-NEXT: movq $-1, %rcx
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm6
+; AVX512F-NEXT: vmovq %xmm4, %rdx
+; AVX512F-NEXT: vmovq %xmm5, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5
+; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm6
+; AVX512F-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm7
+; AVX512F-NEXT: vmovq %xmm5, %rdx
+; AVX512F-NEXT: vmovq %xmm6, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm5
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT: vextracti32x4 $1, %zmm2, %xmm5
+; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; AVX512F-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm7
+; AVX512F-NEXT: vmovq %xmm5, %rdx
+; AVX512F-NEXT: vmovq %xmm6, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm5
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm6
+; AVX512F-NEXT: vmovq %xmm2, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm5
+; AVX512F-NEXT: vmovq %xmm2, %rdx
+; AVX512F-NEXT: vmovq %xmm4, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm4
+; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; AVX512F-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm6
+; AVX512F-NEXT: vmovq %xmm4, %rdx
+; AVX512F-NEXT: vmovq %xmm5, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vextracti32x4 $1, %zmm3, %xmm4
+; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm5
+; AVX512F-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm6
+; AVX512F-NEXT: vmovq %xmm4, %rdx
+; AVX512F-NEXT: vmovq %xmm5, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512F-NEXT: vpextrq $1, %xmm3, %rdx
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm5
+; AVX512F-NEXT: vmovq %xmm3, %rdx
+; AVX512F-NEXT: vmovq %xmm1, %rsi
+; AVX512F-NEXT: cmpq %rdx, %rsi
+; AVX512F-NEXT: cmovgq %rcx, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v16i64:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vextracti64x2 $3, %zmm2, %xmm4
+; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX512DQ-NEXT: vextracti64x2 $3, %zmm0, %xmm5
+; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512DQ-NEXT: xorl %eax, %eax
+; AVX512DQ-NEXT: cmpq %rcx, %rdx
+; AVX512DQ-NEXT: movq $-1, %rcx
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm6
+; AVX512DQ-NEXT: vmovq %xmm4, %rdx
+; AVX512DQ-NEXT: vmovq %xmm5, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm4
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512DQ-NEXT: vextracti64x2 $2, %zmm2, %xmm5
+; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512DQ-NEXT: vextracti64x2 $2, %zmm0, %xmm6
+; AVX512DQ-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm7
+; AVX512DQ-NEXT: vmovq %xmm5, %rdx
+; AVX512DQ-NEXT: vmovq %xmm6, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm5
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vextracti64x2 $1, %zmm2, %xmm5
+; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm6
+; AVX512DQ-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm7
+; AVX512DQ-NEXT: vmovq %xmm5, %rdx
+; AVX512DQ-NEXT: vmovq %xmm6, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm5
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm6
+; AVX512DQ-NEXT: vmovq %xmm2, %rdx
+; AVX512DQ-NEXT: vmovq %xmm0, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm0
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x2 $3, %zmm3, %xmm2
+; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512DQ-NEXT: vextracti64x2 $3, %zmm1, %xmm4
+; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm5
+; AVX512DQ-NEXT: vmovq %xmm2, %rdx
+; AVX512DQ-NEXT: vmovq %xmm4, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm2
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; AVX512DQ-NEXT: vextracti64x2 $2, %zmm3, %xmm4
+; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512DQ-NEXT: vextracti64x2 $2, %zmm1, %xmm5
+; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm6
+; AVX512DQ-NEXT: vmovq %xmm4, %rdx
+; AVX512DQ-NEXT: vmovq %xmm5, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm4
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512DQ-NEXT: vextracti64x2 $1, %zmm3, %xmm4
+; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512DQ-NEXT: vextracti64x2 $1, %zmm1, %xmm5
+; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm6
+; AVX512DQ-NEXT: vmovq %xmm4, %rdx
+; AVX512DQ-NEXT: vmovq %xmm5, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm4
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512DQ-NEXT: vpextrq $1, %xmm3, %rdx
+; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgq %rcx, %rdx
+; AVX512DQ-NEXT: vmovq %rdx, %xmm5
+; AVX512DQ-NEXT: vmovq %xmm3, %rdx
+; AVX512DQ-NEXT: vmovq %xmm1, %rsi
+; AVX512DQ-NEXT: cmpq %rdx, %rsi
+; AVX512DQ-NEXT: cmovgq %rcx, %rax
+; AVX512DQ-NEXT: vmovq %rax, %xmm1
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: test_cmp_v16i64:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4
+; AVX512BW-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm5
+; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512BW-NEXT: xorl %eax, %eax
+; AVX512BW-NEXT: cmpq %rcx, %rdx
+; AVX512BW-NEXT: movq $-1, %rcx
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm6
+; AVX512BW-NEXT: vmovq %xmm4, %rdx
+; AVX512BW-NEXT: vmovq %xmm5, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm4
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5
+; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm6
+; AVX512BW-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm7
+; AVX512BW-NEXT: vmovq %xmm5, %rdx
+; AVX512BW-NEXT: vmovq %xmm6, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm5
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm5
+; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; AVX512BW-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm7
+; AVX512BW-NEXT: vmovq %xmm5, %rdx
+; AVX512BW-NEXT: vmovq %xmm6, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm5
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512BW-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm6
+; AVX512BW-NEXT: vmovq %xmm2, %rdx
+; AVX512BW-NEXT: vmovq %xmm0, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512BW-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm5
+; AVX512BW-NEXT: vmovq %xmm2, %rdx
+; AVX512BW-NEXT: vmovq %xmm4, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm2
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4
+; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; AVX512BW-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm6
+; AVX512BW-NEXT: vmovq %xmm4, %rdx
+; AVX512BW-NEXT: vmovq %xmm5, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm4
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4
+; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm5
+; AVX512BW-NEXT: vpextrq $1, %xmm5, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm6
+; AVX512BW-NEXT: vmovq %xmm4, %rdx
+; AVX512BW-NEXT: vmovq %xmm5, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm4
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512BW-NEXT: vpextrq $1, %xmm3, %rdx
+; AVX512BW-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgq %rcx, %rdx
+; AVX512BW-NEXT: vmovq %rdx, %xmm5
+; AVX512BW-NEXT: vmovq %xmm3, %rdx
+; AVX512BW-NEXT: vmovq %xmm1, %rsi
+; AVX512BW-NEXT: cmpq %rdx, %rsi
+; AVX512BW-NEXT: cmovgq %rcx, %rax
+; AVX512BW-NEXT: vmovq %rax, %xmm1
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: retq
%1 = icmp sgt <16 x i64> %a0, %a1
ret <16 x i1> %1
}
@@ -3252,223 +4252,660 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cmp_v32i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vextracti32x4 $3, %zmm2, %xmm4
-; AVX512-NEXT: vpextrd $1, %xmm4, %ecx
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm5
-; AVX512-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: cmpl %ecx, %edx
-; AVX512-NEXT: movl $-1, %ecx
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm4, %esi
-; AVX512-NEXT: vmovd %xmm5, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm6
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512-NEXT: vextracti32x4 $2, %zmm2, %xmm5
-; AVX512-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm6
-; AVX512-NEXT: vpextrd $1, %xmm6, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm5, %esi
-; AVX512-NEXT: vmovd %xmm6, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm7
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
-; AVX512-NEXT: vpextrd $2, %xmm5, %edx
-; AVX512-NEXT: vpextrd $2, %xmm6, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
-; AVX512-NEXT: vpextrd $3, %xmm5, %edx
-; AVX512-NEXT: vpextrd $3, %xmm6, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512-NEXT: vextracti32x4 $1, %zmm2, %xmm5
-; AVX512-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; AVX512-NEXT: vpextrd $1, %xmm6, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm5, %esi
-; AVX512-NEXT: vmovd %xmm6, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm7
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
-; AVX512-NEXT: vpextrd $2, %xmm5, %edx
-; AVX512-NEXT: vpextrd $2, %xmm6, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
-; AVX512-NEXT: vpextrd $3, %xmm5, %edx
-; AVX512-NEXT: vpextrd $3, %xmm6, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
-; AVX512-NEXT: vpextrd $1, %xmm2, %edx
-; AVX512-NEXT: vpextrd $1, %xmm0, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm2, %esi
-; AVX512-NEXT: vmovd %xmm0, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm6
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $2, %xmm2, %edx
-; AVX512-NEXT: vpextrd $2, %xmm0, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $3, %xmm2, %edx
-; AVX512-NEXT: vpextrd $3, %xmm0, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vextracti32x4 $3, %zmm3, %xmm2
-; AVX512-NEXT: vpextrd $1, %xmm2, %edx
-; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512-NEXT: vpextrd $1, %xmm4, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm2, %esi
-; AVX512-NEXT: vmovd %xmm4, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm5
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpextrd $2, %xmm2, %edx
-; AVX512-NEXT: vpextrd $2, %xmm4, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpextrd $3, %xmm2, %edx
-; AVX512-NEXT: vpextrd $3, %xmm4, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
-; AVX512-NEXT: vextracti32x4 $2, %zmm3, %xmm4
-; AVX512-NEXT: vpextrd $1, %xmm4, %edx
-; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; AVX512-NEXT: vpextrd $1, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm4, %esi
-; AVX512-NEXT: vmovd %xmm5, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm6
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512-NEXT: vextracti32x4 $1, %zmm3, %xmm4
-; AVX512-NEXT: vpextrd $1, %xmm4, %edx
-; AVX512-NEXT: vextracti32x4 $1, %zmm1, %xmm5
-; AVX512-NEXT: vpextrd $1, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm4, %esi
-; AVX512-NEXT: vmovd %xmm5, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm6
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512-NEXT: vpextrd $1, %xmm3, %edx
-; AVX512-NEXT: vpextrd $1, %xmm1, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vmovd %xmm3, %esi
-; AVX512-NEXT: vmovd %xmm1, %edi
-; AVX512-NEXT: cmpl %esi, %edi
-; AVX512-NEXT: movl $0, %esi
-; AVX512-NEXT: cmovgl %ecx, %esi
-; AVX512-NEXT: vmovd %esi, %xmm5
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpextrd $2, %xmm3, %edx
-; AVX512-NEXT: vpextrd $2, %xmm1, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: movl $0, %edx
-; AVX512-NEXT: cmovgl %ecx, %edx
-; AVX512-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512-NEXT: vpextrd $3, %xmm3, %edx
-; AVX512-NEXT: vpextrd $3, %xmm1, %esi
-; AVX512-NEXT: cmpl %edx, %esi
-; AVX512-NEXT: cmovgl %ecx, %eax
-; AVX512-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v32i32:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm4
+; AVX512F-NEXT: vpextrd $1, %xmm4, %ecx
+; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm5
+; AVX512F-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512F-NEXT: xorl %eax, %eax
+; AVX512F-NEXT: cmpl %ecx, %edx
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm4, %esi
+; AVX512F-NEXT: vmovd %xmm5, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm6
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512F-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512F-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512F-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5
+; AVX512F-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm6
+; AVX512F-NEXT: vpextrd $1, %xmm6, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm5, %esi
+; AVX512F-NEXT: vmovd %xmm6, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm7
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $2, %xmm5, %edx
+; AVX512F-NEXT: vpextrd $2, %xmm6, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $3, %xmm5, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm6, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT: vextracti32x4 $1, %zmm2, %xmm5
+; AVX512F-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; AVX512F-NEXT: vpextrd $1, %xmm6, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm5, %esi
+; AVX512F-NEXT: vmovd %xmm6, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm7
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $2, %xmm5, %edx
+; AVX512F-NEXT: vpextrd $2, %xmm6, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
+; AVX512F-NEXT: vpextrd $3, %xmm5, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm6, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
+; AVX512F-NEXT: vpextrd $1, %xmm2, %edx
+; AVX512F-NEXT: vpextrd $1, %xmm0, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm2, %esi
+; AVX512F-NEXT: vmovd %xmm0, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm6
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512F-NEXT: vpextrd $2, %xmm2, %edx
+; AVX512F-NEXT: vpextrd $2, %xmm0, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512F-NEXT: vpextrd $3, %xmm2, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm0, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512F-NEXT: vpextrd $1, %xmm2, %edx
+; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; AVX512F-NEXT: vpextrd $1, %xmm4, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm2, %esi
+; AVX512F-NEXT: vmovd %xmm4, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm5
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512F-NEXT: vpextrd $2, %xmm2, %edx
+; AVX512F-NEXT: vpextrd $2, %xmm4, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512F-NEXT: vpextrd $3, %xmm2, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm4, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
+; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm4
+; AVX512F-NEXT: vpextrd $1, %xmm4, %edx
+; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; AVX512F-NEXT: vpextrd $1, %xmm5, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm4, %esi
+; AVX512F-NEXT: vmovd %xmm5, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm6
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512F-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512F-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512F-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vextracti32x4 $1, %zmm3, %xmm4
+; AVX512F-NEXT: vpextrd $1, %xmm4, %edx
+; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm5
+; AVX512F-NEXT: vpextrd $1, %xmm5, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm4, %esi
+; AVX512F-NEXT: vmovd %xmm5, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm6
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512F-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512F-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512F-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512F-NEXT: vpextrd $1, %xmm3, %edx
+; AVX512F-NEXT: vpextrd $1, %xmm1, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vmovd %xmm3, %esi
+; AVX512F-NEXT: vmovd %xmm1, %edi
+; AVX512F-NEXT: cmpl %esi, %edi
+; AVX512F-NEXT: movl $0, %esi
+; AVX512F-NEXT: cmovgl %ecx, %esi
+; AVX512F-NEXT: vmovd %esi, %xmm5
+; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512F-NEXT: vpextrd $2, %xmm3, %edx
+; AVX512F-NEXT: vpextrd $2, %xmm1, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: movl $0, %edx
+; AVX512F-NEXT: cmovgl %ecx, %edx
+; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512F-NEXT: vpextrd $3, %xmm3, %edx
+; AVX512F-NEXT: vpextrd $3, %xmm1, %esi
+; AVX512F-NEXT: cmpl %edx, %esi
+; AVX512F-NEXT: cmovgl %ecx, %eax
+; AVX512F-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v32i32:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vextracti32x4 $3, %zmm2, %xmm4
+; AVX512DQ-NEXT: vpextrd $1, %xmm4, %ecx
+; AVX512DQ-NEXT: vextracti32x4 $3, %zmm0, %xmm5
+; AVX512DQ-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512DQ-NEXT: xorl %eax, %eax
+; AVX512DQ-NEXT: cmpl %ecx, %edx
+; AVX512DQ-NEXT: movl $-1, %ecx
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vmovd %xmm4, %esi
+; AVX512DQ-NEXT: vmovd %xmm5, %edi
+; AVX512DQ-NEXT: cmpl %esi, %edi
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmovgl %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm6
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512DQ-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512DQ-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512DQ-NEXT: vextracti32x4 $2, %zmm2, %xmm5
+; AVX512DQ-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm6
+; AVX512DQ-NEXT: vpextrd $1, %xmm6, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vmovd %xmm5, %esi
+; AVX512DQ-NEXT: vmovd %xmm6, %edi
+; AVX512DQ-NEXT: cmpl %esi, %edi
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmovgl %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm7
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
+; AVX512DQ-NEXT: vpextrd $2, %xmm5, %edx
+; AVX512DQ-NEXT: vpextrd $2, %xmm6, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
+; AVX512DQ-NEXT: vpextrd $3, %xmm5, %edx
+; AVX512DQ-NEXT: vpextrd $3, %xmm6, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vextracti32x4 $1, %zmm2, %xmm5
+; AVX512DQ-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512DQ-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; AVX512DQ-NEXT: vpextrd $1, %xmm6, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vmovd %xmm5, %esi
+; AVX512DQ-NEXT: vmovd %xmm6, %edi
+; AVX512DQ-NEXT: cmpl %esi, %edi
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmovgl %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm7
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
+; AVX512DQ-NEXT: vpextrd $2, %xmm5, %edx
+; AVX512DQ-NEXT: vpextrd $2, %xmm6, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
+; AVX512DQ-NEXT: vpextrd $3, %xmm5, %edx
+; AVX512DQ-NEXT: vpextrd $3, %xmm6, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
+; AVX512DQ-NEXT: vpextrd $1, %xmm2, %edx
+; AVX512DQ-NEXT: vpextrd $1, %xmm0, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vmovd %xmm2, %esi
+; AVX512DQ-NEXT: vmovd %xmm0, %edi
+; AVX512DQ-NEXT: cmpl %esi, %edi
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmovgl %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm6
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpextrd $2, %xmm2, %edx
+; AVX512DQ-NEXT: vpextrd $2, %xmm0, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpextrd $3, %xmm2, %edx
+; AVX512DQ-NEXT: vpextrd $3, %xmm0, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512DQ-NEXT: vpextrd $1, %xmm2, %edx
+; AVX512DQ-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; AVX512DQ-NEXT: vpextrd $1, %xmm4, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vmovd %xmm2, %esi
+; AVX512DQ-NEXT: vmovd %xmm4, %edi
+; AVX512DQ-NEXT: cmpl %esi, %edi
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmovgl %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm5
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpextrd $2, %xmm2, %edx
+; AVX512DQ-NEXT: vpextrd $2, %xmm4, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpextrd $3, %xmm2, %edx
+; AVX512DQ-NEXT: vpextrd $3, %xmm4, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
+; AVX512DQ-NEXT: vextracti32x4 $2, %zmm3, %xmm4
+; AVX512DQ-NEXT: vpextrd $1, %xmm4, %edx
+; AVX512DQ-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; AVX512DQ-NEXT: vpextrd $1, %xmm5, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vmovd %xmm4, %esi
+; AVX512DQ-NEXT: vmovd %xmm5, %edi
+; AVX512DQ-NEXT: cmpl %esi, %edi
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmovgl %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm6
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512DQ-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512DQ-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512DQ-NEXT: vextracti32x4 $1, %zmm3, %xmm4
+; AVX512DQ-NEXT: vpextrd $1, %xmm4, %edx
+; AVX512DQ-NEXT: vextracti32x4 $1, %zmm1, %xmm5
+; AVX512DQ-NEXT: vpextrd $1, %xmm5, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vmovd %xmm4, %esi
+; AVX512DQ-NEXT: vmovd %xmm5, %edi
+; AVX512DQ-NEXT: cmpl %esi, %edi
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmovgl %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm6
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512DQ-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512DQ-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512DQ-NEXT: vpextrd $1, %xmm3, %edx
+; AVX512DQ-NEXT: vpextrd $1, %xmm1, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vmovd %xmm3, %esi
+; AVX512DQ-NEXT: vmovd %xmm1, %edi
+; AVX512DQ-NEXT: cmpl %esi, %edi
+; AVX512DQ-NEXT: movl $0, %esi
+; AVX512DQ-NEXT: cmovgl %ecx, %esi
+; AVX512DQ-NEXT: vmovd %esi, %xmm5
+; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpextrd $2, %xmm3, %edx
+; AVX512DQ-NEXT: vpextrd $2, %xmm1, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: movl $0, %edx
+; AVX512DQ-NEXT: cmovgl %ecx, %edx
+; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpextrd $3, %xmm3, %edx
+; AVX512DQ-NEXT: vpextrd $3, %xmm1, %esi
+; AVX512DQ-NEXT: cmpl %edx, %esi
+; AVX512DQ-NEXT: cmovgl %ecx, %eax
+; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: test_cmp_v32i32:
+; AVX512BW: # BB#0:
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4
+; AVX512BW-NEXT: vpextrd $1, %xmm4, %ecx
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm5
+; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512BW-NEXT: xorl %eax, %eax
+; AVX512BW-NEXT: cmpl %ecx, %edx
+; AVX512BW-NEXT: movl $-1, %ecx
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vmovd %xmm4, %esi
+; AVX512BW-NEXT: vmovd %xmm5, %edi
+; AVX512BW-NEXT: cmpl %esi, %edi
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmovgl %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm6
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512BW-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512BW-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512BW-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5
+; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm6
+; AVX512BW-NEXT: vpextrd $1, %xmm6, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vmovd %xmm5, %esi
+; AVX512BW-NEXT: vmovd %xmm6, %edi
+; AVX512BW-NEXT: cmpl %esi, %edi
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmovgl %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm7
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
+; AVX512BW-NEXT: vpextrd $2, %xmm5, %edx
+; AVX512BW-NEXT: vpextrd $2, %xmm6, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
+; AVX512BW-NEXT: vpextrd $3, %xmm5, %edx
+; AVX512BW-NEXT: vpextrd $3, %xmm6, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm5
+; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; AVX512BW-NEXT: vpextrd $1, %xmm6, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vmovd %xmm5, %esi
+; AVX512BW-NEXT: vmovd %xmm6, %edi
+; AVX512BW-NEXT: cmpl %esi, %edi
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmovgl %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm7
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
+; AVX512BW-NEXT: vpextrd $2, %xmm5, %edx
+; AVX512BW-NEXT: vpextrd $2, %xmm6, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
+; AVX512BW-NEXT: vpextrd $3, %xmm5, %edx
+; AVX512BW-NEXT: vpextrd $3, %xmm6, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
+; AVX512BW-NEXT: vpextrd $1, %xmm2, %edx
+; AVX512BW-NEXT: vpextrd $1, %xmm0, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vmovd %xmm2, %esi
+; AVX512BW-NEXT: vmovd %xmm0, %edi
+; AVX512BW-NEXT: cmpl %esi, %edi
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmovgl %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm6
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512BW-NEXT: vpextrd $2, %xmm2, %edx
+; AVX512BW-NEXT: vpextrd $2, %xmm0, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512BW-NEXT: vpextrd $3, %xmm2, %edx
+; AVX512BW-NEXT: vpextrd $3, %xmm0, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2
+; AVX512BW-NEXT: vpextrd $1, %xmm2, %edx
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4
+; AVX512BW-NEXT: vpextrd $1, %xmm4, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vmovd %xmm2, %esi
+; AVX512BW-NEXT: vmovd %xmm4, %edi
+; AVX512BW-NEXT: cmpl %esi, %edi
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmovgl %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm5
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512BW-NEXT: vpextrd $2, %xmm2, %edx
+; AVX512BW-NEXT: vpextrd $2, %xmm4, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512BW-NEXT: vpextrd $3, %xmm2, %edx
+; AVX512BW-NEXT: vpextrd $3, %xmm4, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4
+; AVX512BW-NEXT: vpextrd $1, %xmm4, %edx
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5
+; AVX512BW-NEXT: vpextrd $1, %xmm5, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vmovd %xmm4, %esi
+; AVX512BW-NEXT: vmovd %xmm5, %edi
+; AVX512BW-NEXT: cmpl %esi, %edi
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmovgl %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm6
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512BW-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512BW-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512BW-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4
+; AVX512BW-NEXT: vpextrd $1, %xmm4, %edx
+; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm5
+; AVX512BW-NEXT: vpextrd $1, %xmm5, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vmovd %xmm4, %esi
+; AVX512BW-NEXT: vmovd %xmm5, %edi
+; AVX512BW-NEXT: cmpl %esi, %edi
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmovgl %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm6
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
+; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx
+; AVX512BW-NEXT: vpextrd $2, %xmm5, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
+; AVX512BW-NEXT: vpextrd $3, %xmm4, %edx
+; AVX512BW-NEXT: vpextrd $3, %xmm5, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
+; AVX512BW-NEXT: vpextrd $1, %xmm3, %edx
+; AVX512BW-NEXT: vpextrd $1, %xmm1, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vmovd %xmm3, %esi
+; AVX512BW-NEXT: vmovd %xmm1, %edi
+; AVX512BW-NEXT: cmpl %esi, %edi
+; AVX512BW-NEXT: movl $0, %esi
+; AVX512BW-NEXT: cmovgl %ecx, %esi
+; AVX512BW-NEXT: vmovd %esi, %xmm5
+; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512BW-NEXT: vpextrd $2, %xmm3, %edx
+; AVX512BW-NEXT: vpextrd $2, %xmm1, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: movl $0, %edx
+; AVX512BW-NEXT: cmovgl %ecx, %edx
+; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
+; AVX512BW-NEXT: vpextrd $3, %xmm3, %edx
+; AVX512BW-NEXT: vpextrd $3, %xmm1, %esi
+; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: cmovgl %ecx, %eax
+; AVX512BW-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: retq
%1 = icmp sgt <32 x i32> %a0, %a1
ret <32 x i1> %1
}
@@ -4342,291 +5779,987 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cmp_v64i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3
-; AVX512-NEXT: vpmovsxwd %ymm3, %zmm3
-; AVX512-NEXT: vpslld $31, %zmm3, %zmm3
-; AVX512-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512-NEXT: kshiftlw $14, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: kshiftlw $15, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm3
-; AVX512-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $13, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $12, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $11, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $10, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $9, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $8, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $7, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $6, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $5, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $4, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $3, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $2, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftlw $1, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512-NEXT: kshiftrw $15, %k0, %k0
-; AVX512-NEXT: kmovw %k0, %eax
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2
-; AVX512-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512-NEXT: vpslld $31, %zmm2, %zmm2
-; AVX512-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512-NEXT: kshiftlw $14, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: kshiftlw $15, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm2
-; AVX512-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $13, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $12, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $11, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $10, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $9, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $8, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $7, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $6, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $5, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $4, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $3, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $2, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftlw $1, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512-NEXT: kshiftrw $15, %k0, %k0
-; AVX512-NEXT: kmovw %k0, %eax
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512-NEXT: vpsllw $7, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512-NEXT: vpxor %ymm6, %ymm6, %ymm6
-; AVX512-NEXT: vpcmpgtb %ymm2, %ymm6, %ymm2
-; AVX512-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1
-; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512-NEXT: vpslld $31, %zmm1, %zmm1
-; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512-NEXT: kshiftlw $14, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: kshiftlw $15, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm1
-; AVX512-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $13, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $12, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $11, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $10, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $9, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $8, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $7, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $6, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $5, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $4, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $3, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $2, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftlw $1, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512-NEXT: kshiftrw $15, %k0, %k0
-; AVX512-NEXT: kmovw %k0, %eax
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX512-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0
-; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512-NEXT: kshiftlw $14, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: kshiftlw $15, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
-; AVX512-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $13, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $12, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $11, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $10, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $9, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $8, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $7, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $6, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $5, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $4, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $3, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $2, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftlw $1, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovw %k1, %eax
-; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kshiftrw $15, %k0, %k0
-; AVX512-NEXT: kmovw %k0, %eax
-; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vpcmpgtb %ymm0, %ymm6, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<kill>
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cmp_v64i16:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3
+; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3
+; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm3
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm2
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpxor %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm1
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $14, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: kshiftlw $15, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %ecx
+; AVX512F-NEXT: vmovd %ecx, %xmm0
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $13, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $11, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $10, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $9, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $8, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $7, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $6, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $5, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $4, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $3, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $2, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftlw $1, %k0, %k1
+; AVX512F-NEXT: kshiftrw $15, %k1, %k1
+; AVX512F-NEXT: kmovw %k1, %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<kill>
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: test_cmp_v64i16:
+; AVX512DQ: # BB#0:
+; AVX512DQ-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpmovsxwd %ymm3, %zmm3
+; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3
+; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
+; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %ecx
+; AVX512DQ-NEXT: vmovd %ecx, %xmm3
+; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT: kmovw %k0, %eax
+; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0
+; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %ecx
+; AVX512DQ-NEXT: vmovd %ecx, %xmm2
+; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1
+; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT: kmovw %k1, %eax
+; A